In [2]:
import pandas as pd
import numpy as np
In [3]:
cic_df=pd.read_parquet("..//cic/cic-collection.parquet")
In [4]:
cic_df.head()
Out[4]:
Flow Duration Total Fwd Packets Total Backward Packets Fwd Packets Length Total Bwd Packets Length Total Fwd Packet Length Max Fwd Packet Length Mean Fwd Packet Length Std Bwd Packet Length Max Bwd Packet Length Mean ... Active Mean Active Std Active Max Active Min Idle Mean Idle Std Idle Max Idle Min Label ClassLabel
0 4 2 0 12.0 0.0 6.0 6.00000 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign
1 1 2 0 12.0 0.0 6.0 6.00000 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign
2 3 2 0 12.0 0.0 6.0 6.00000 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign
3 1 2 0 12.0 0.0 6.0 6.00000 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign
4 609 7 4 484.0 414.0 233.0 69.14286 111.967896 207.0 103.5 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign

5 rows × 59 columns

In [5]:
cic_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9167581 entries, 0 to 9167580
Data columns (total 59 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   Flow Duration             int64  
 1   Total Fwd Packets         int32  
 2   Total Backward Packets    int32  
 3   Fwd Packets Length Total  float64
 4   Bwd Packets Length Total  float64
 5   Fwd Packet Length Max     float64
 6   Fwd Packet Length Mean    float32
 7   Fwd Packet Length Std     float32
 8   Bwd Packet Length Max     float64
 9   Bwd Packet Length Mean    float32
 10  Bwd Packet Length Std     float32
 11  Flow Bytes/s              float64
 12  Flow Packets/s            float64
 13  Flow IAT Mean             float32
 14  Flow IAT Std              float32
 15  Flow IAT Max              float64
 16  Flow IAT Min              float64
 17  Fwd IAT Total             float64
 18  Fwd IAT Mean              float32
 19  Fwd IAT Std               float32
 20  Fwd IAT Max               float64
 21  Fwd IAT Min               float64
 22  Bwd IAT Total             float64
 23  Bwd IAT Mean              float32
 24  Bwd IAT Std               float32
 25  Bwd IAT Max               float64
 26  Bwd IAT Min               float64
 27  Fwd PSH Flags             int8   
 28  Fwd Header Length         int64  
 29  Bwd Header Length         int64  
 30  Fwd Packets/s             float32
 31  Bwd Packets/s             float32
 32  Packet Length Max         float64
 33  Packet Length Mean        float32
 34  Packet Length Std         float32
 35  Packet Length Variance    float32
 36  SYN Flag Count            int8   
 37  URG Flag Count            int8   
 38  Avg Packet Size           float32
 39  Avg Fwd Segment Size      float32
 40  Avg Bwd Segment Size      float32
 41  Subflow Fwd Packets       int32  
 42  Subflow Fwd Bytes         int32  
 43  Subflow Bwd Packets       int32  
 44  Subflow Bwd Bytes         int32  
 45  Init Fwd Win Bytes        int32  
 46  Init Bwd Win Bytes        int32  
 47  Fwd Act Data Packets      int32  
 48  Fwd Seg Size Min          int32  
 49  Active Mean               float32
 50  Active Std                float32
 51  Active Max                float64
 52  Active Min                float64
 53  Idle Mean                 float32
 54  Idle Std                  float32
 55  Idle Max                  float64
 56  Idle Min                  float64
 57  Label                     object 
 58  ClassLabel                object 
dtypes: float32(22), float64(19), int32(10), int64(3), int8(3), object(2)
memory usage: 2.8+ GB
In [6]:
cic_df.shape
Out[6]:
(9167581, 59)
In [7]:
cic_df.isna().sum()
Out[7]:
Flow Duration               0
Total Fwd Packets           0
Total Backward Packets      0
Fwd Packets Length Total    0
Bwd Packets Length Total    0
Fwd Packet Length Max       0
Fwd Packet Length Mean      0
Fwd Packet Length Std       0
Bwd Packet Length Max       0
Bwd Packet Length Mean      0
Bwd Packet Length Std       0
Flow Bytes/s                0
Flow Packets/s              0
Flow IAT Mean               0
Flow IAT Std                0
Flow IAT Max                0
Flow IAT Min                0
Fwd IAT Total               0
Fwd IAT Mean                0
Fwd IAT Std                 0
Fwd IAT Max                 0
Fwd IAT Min                 0
Bwd IAT Total               0
Bwd IAT Mean                0
Bwd IAT Std                 0
Bwd IAT Max                 0
Bwd IAT Min                 0
Fwd PSH Flags               0
Fwd Header Length           0
Bwd Header Length           0
Fwd Packets/s               0
Bwd Packets/s               0
Packet Length Max           0
Packet Length Mean          0
Packet Length Std           0
Packet Length Variance      0
SYN Flag Count              0
URG Flag Count              0
Avg Packet Size             0
Avg Fwd Segment Size        0
Avg Bwd Segment Size        0
Subflow Fwd Packets         0
Subflow Fwd Bytes           0
Subflow Bwd Packets         0
Subflow Bwd Bytes           0
Init Fwd Win Bytes          0
Init Bwd Win Bytes          0
Fwd Act Data Packets        0
Fwd Seg Size Min            0
Active Mean                 0
Active Std                  0
Active Max                  0
Active Min                  0
Idle Mean                   0
Idle Std                    0
Idle Max                    0
Idle Min                    0
Label                       0
ClassLabel                  0
dtype: int64
In [8]:
cic_df[cic_df.duplicated()]
Out[8]:
Flow Duration Total Fwd Packets Total Backward Packets Fwd Packets Length Total Bwd Packets Length Total Fwd Packet Length Max Fwd Packet Length Mean Fwd Packet Length Std Bwd Packet Length Max Bwd Packet Length Mean ... Active Mean Active Std Active Max Active Min Idle Mean Idle Std Idle Max Idle Min Label ClassLabel
321783 1 2 0 12.0 0.0 6.0 6.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign
346078 2 3 0 18.0 0.0 6.0 6.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign
361731 2 1 1 6.0 6.0 6.0 6.0 0.0 6.0 6.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign
366568 4 2 0 12.0 0.0 6.0 6.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign
369416 3 3 0 18.0 0.0 6.0 6.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9030704 1 3 0 18.0 0.0 6.0 6.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign
9036659 158 1 1 6.0 6.0 6.0 6.0 0.0 6.0 6.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign
9111903 644 1 1 6.0 6.0 6.0 6.0 0.0 6.0 6.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign
9134426 428 1 1 6.0 6.0 6.0 6.0 0.0 6.0 6.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign
9163553 11 2 1 12.0 6.0 6.0 6.0 0.0 6.0 6.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign

310 rows × 59 columns

In [9]:
cic_df['Label'].value_counts()
Out[9]:
Label
Benign                  7186189
DDoS-LOIC-HTTP           575364
DoS-Hulk                 318740
DDoS-HOIC                198861
Botnet                   145968
DDoS                     128062
DDoS-NTP                 121328
DDoS-TFTP                 98833
Bruteforce-SSH            97260
Infiltration              94857
DoS-Goldeneye             52324
DDoS-Syn                  47757
DDoS-UDP                  28863
DoS-Slowloris             15243
DDoS-MSSQL                11784
DDoS-UDPLag                8452
Bruteforce-FTP             5984
DoS-Slowhttptest           5271
DDoS-Ddossim               5115
DDoS-DNS                   3668
DoS-Slowread               2786
Portscan                   2255
DDoS-LDAP                  2092
Webattack-bruteforce       2020
DDoS-SNMP                  2017
DDoS-Slowloris             1858
DoS-Slowheaders            1649
Webattack-XSS               876
DoS-Rudy                    699
DDoS-NetBIOS                675
DoS-Slowbody                621
Webattack-SQLi               99
DoS-Heartbleed               11
Name: count, dtype: int64
In [10]:
cic_df['ClassLabel'].value_counts()
Out[10]:
ClassLabel
Benign          7186189
DDoS            1234729
DoS              397344
Botnet           145968
Bruteforce       103244
Infiltration      94857
Webattack          2995
Portscan           2255
Name: count, dtype: int64
In [11]:
#Removing the duplicate records
cic_df.drop_duplicates(inplace=True)
In [12]:
cic_df.shape
Out[12]:
(9167271, 59)
  • Number of rows with duplicate records: 9167581
  • Number of rows after removal of duplicate records: 9167271
In [13]:
cic_df['Label'].value_counts()
Out[13]:
Label
Benign                  7185881
DDoS-LOIC-HTTP           575364
DoS-Hulk                 318740
DDoS-HOIC                198861
Botnet                   145968
DDoS                     128062
DDoS-NTP                 121326
DDoS-TFTP                 98833
Bruteforce-SSH            97260
Infiltration              94857
DoS-Goldeneye             52324
DDoS-Syn                  47757
DDoS-UDP                  28863
DoS-Slowloris             15243
DDoS-MSSQL                11784
DDoS-UDPLag                8452
Bruteforce-FTP             5984
DoS-Slowhttptest           5271
DDoS-Ddossim               5115
DDoS-DNS                   3668
DoS-Slowread               2786
Portscan                   2255
DDoS-LDAP                  2092
Webattack-bruteforce       2020
DDoS-SNMP                  2017
DDoS-Slowloris             1858
DoS-Slowheaders            1649
Webattack-XSS               876
DoS-Rudy                    699
DDoS-NetBIOS                675
DoS-Slowbody                621
Webattack-SQLi               99
DoS-Heartbleed               11
Name: count, dtype: int64
  • Number of rows with Label=Benign prior removal of duplicates: 7186189.
  • Number of rows with Label=Benign after removal of duplicates: 7185881.
  • Number of rows with Label=DDoS-NTP prior removal of duplicates: 121328.
  • Number of rows with Label=DDOS-NTP after removal of duplicates: 121326.

Thus: -

  • 0.0042% of duplicate records for Label=Benign were removed.
  • 0.0016% of duplicate records for Label=DDOS-NTP were removed.

We observed that very small proportion of duplicate records were removed for both dominant class and minority class in the dataset; Overall the distribution of rows with respect to field: Label has remained same.

In [14]:
cic_df['ClassLabel'].value_counts()
Out[14]:
ClassLabel
Benign          7185881
DDoS            1234727
DoS              397344
Botnet           145968
Bruteforce       103244
Infiltration      94857
Webattack          2995
Portscan           2255
Name: count, dtype: int64
  • Number of rows with ClassLabel=Benign prior removal of duplicates: 7186189.
  • Number of rows with ClassLabel=Benign after removal of duplicates: 7185881.
  • Number of rows with ClassLabel=DDOS prior removal of duplicates: 1234729.
  • Number of rows with ClassLabel=DDOS after removal of duplicates: 1234727.

Thus: -

  • 0.0042% of duplicate records for ClassLabel=Benign were removed.
  • 0.00016% of duplicate records for ClassLabel=DDOS were removed.

We observed that very small proportion of duplicate records were removed for both dominant class and minority class in the dataset; Overall the distribution of rows with respect to field: ClassLabel has remained same.

In [15]:
import matplotlib.pyplot as plt
In [16]:
axes=cic_df.hist(bins=50, figsize=(50,50))
for ax, col in zip(axes.flatten(), cic_df.columns):
    ax.set_xlabel("Values")
    ax.set_ylabel("Frequency")
    ax.set_title(col)
plt.show()
No description has been provided for this image
In [17]:
axes=cic_df.hist(bins=50, figsize=(50,50), log=True)
for ax, col in zip(axes.flatten(), cic_df.columns):
    ax.set_xlabel("Values")
    ax.set_ylabel("Frequency w.r.t log scale")
    ax.set_title(col)
plt.show()
No description has been provided for this image
In [18]:
# Get the list of columns excluding 'Label' and 'ClassLabel'
columns = [col for col in cic_df.columns if col not in ['Label', 'ClassLabel']]

# Create subplots for each column
fig, axes = plt.subplots(nrows=len(columns), ncols=1, figsize=(10, len(columns) * 5))

# Plot each column's histogram in a separate subplot
for i, column in enumerate(columns):
    cic_df[column].hist(bins=50, ax=axes[i])
    axes[i].set_xlabel("Values")
    axes[i].set_ylabel("Frequency")
    axes[i].set_title(column)

plt.tight_layout()
plt.show()
No description has been provided for this image
  • In the above graphs, the scale is too vast to fit all values and observe the distributions for each feature in the dataset.
  • Thus, below we shall plot the graphs with Logarithmic scale. It will not give us actual values, but will preserve the distribution pattern and give us more understanding about each feature.
In [19]:
# Create subplots for each column
fig, axes = plt.subplots(nrows=len(columns), ncols=1, figsize=(10, len(columns) * 5))

# Plot each column's histogram in a separate subplot
for i, column in enumerate(columns):
    cic_df[column].hist(bins=50, ax=axes[i], log=True)
    axes[i].set_xlabel("Values")
    axes[i].set_ylabel("Frequency w.r.t log scale")
    axes[i].set_title(column)

plt.tight_layout()
plt.show()
No description has been provided for this image

Observations and interpretations from above Histograms with Logarithmic scale: -

  1. Flow Duration: The duration of the flow

    • We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
    • Peak was observed at extreme right, Flow Duration=0.
    • There are some scattered bins of count=1
  2. Total Fwd Packets: Total number of forward packets

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed on first bin from the left, after which we saw sharp decline.
    • There is another small peak around Total Fwd Packets=125000, but it is in plateau shape. Thus, we see many values around 125000.
    • The first bin (Peak) is in the range around 0 to 6250.
    • After the second bin , there is consistent decline.
    • Since there are two peaks at significant distance apart, we can also call the graph bi-modal.
    • We observed value for Total Fwd Packets>300000. This may indicate outlier in the data.
  3. Total Backward Packets: Total number of backward packets

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed on first bin from left, Total Backward Packets: 0 to 6250.
    • After the peak, there is significant decline in results.
    • Some records were observed at regular intervals but with very less frequency.
  4. Fwd Packets Length Total: Total length of forward packets

    • Peak was observed on first bin from left.
    • Most values are stacked on the left side of X-axis and they continuously decline as we move towards right hand side of X-axis.
    • There are a couple of observations at a distance on right hand side after long gap. They may indicate outliers in the data.
  5. Bwd Packets Length Total: Total length of backward packets

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed on first bin from left.
    • After the peak, there is significant decline in results.
    • There are some observations spread out on X-axis, but all have frequency less than 10.
  6. Fwd Packet Length Max: Maximum length of forward packets

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed on first bin from left.
    • The first bin (Peak) is in the range around 0 to 1250.
    • Most number of observations lie between Fwd Packet Length Max>0 and Fwd Packet Length Max<10000.
    • A small peak was observed around Fwd Packet Length Max>20000 and Fwd Packet Length Max<300000. However the frequency is relatively very less compared to the peak observed in first bin.
    • There are some observations around Fwd Packet Length Max=60000. This may indicate outliers in the data.
  7. Fwd Packet Length Mean: Mean length of forward packets

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed on first bin from left.
    • Most number of observations lie between Fwd Packet Length Mean>=0 and Fwd Packet Length Mean<=2500.
    • There are some small number of observations around Fwd Packet Length Mean=15000 and above. This may indicate outliers in the data.
  8. Fwd Packet Length Std: Standard deviation length of forward packets

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed on first bin from left.
    • Most number of observations lie between Fwd Packet Length Std>=0 and Fwd Packet Length Std<=5000.
    • There are some very small number of observations at Fwd Packet Length Std>7500. This may indicate outliers in the data.
  9. Bwd Packet Length Max: Maximum length of backward packets

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed on first bin from left. Peak lies around Bwd Packet Length Max>=0 and Bwd Packet Length Max<=1250.
    • Most number of observations lie between Bwd Packet Length Max>=0 and Bwd Packet Length Max<=10000.
    • There are few observations in the range: - Bwd Packet Length Max>=11250 and Bwd Packet Length Max<=20000, Bwd Packet Length Max>=30000 and Bwd Packet Length Max<=35000.
    • There is an observation at Bwd Packet Length Max>60000. This may indicate outliers in the data.
  10. Bwd Packet Length Mean: Mean length of backward packets

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed on first bin from left. Peak lies around Bwd Packet Length Mean>=0 and Bwd Packet Length Mean<=666.67.
    • After the peak, there is significant decline in results.
    • Between Bwd Packet Length Mean=0 and Bwd Packet Length Mean=5000, we observed J-shaped graph.
    • There is an observation at Bwd Packet Length Mean=35000. This may indicate outliers in the data.
  11. Bwd Packet Length Std: Standard deviation length of backward packets

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed on first bin from left. Peak lies around Bwd Packet Length Std>=0 and Bwd Packet Length Std<=416.67.
    • After the peak, there is significant decline in results.
    • There is plateau region observed around Bwd Packet Length Std>=2083 and Bwd Packet Length Std<=2500.
    • There is another plateau region observed (smaller than the above) around Bwd Packet Length Std>=3750 and Bwd Packet Length Std<=4166.
    • There is an observation at Bwd Packet Length Std>20000. This may indicate outliers in the data.
  12. Flow Bytes/s: Flow bytes per second

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed around Flow Bytes/s=0.
    • After the peak, there is consistent decline in results.
    • Towards right hand side of the graph, there is increase in number of observations compared to other bins prior to it excluding the peak.
    • Between the two extremes of the graph there were some plateau regions.
    • We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
  13. Flow Packets/s: Flow packets per second

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed around Flow Packets/s=0
    • After the peak, there is consistent decline in results.
    • At Flow Packets/s=2 and Flow Packets/s=3, there relatively small peaks.
    • We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
  14. Flow IAT Mean: Mean time between flows

    • Peak was observed at Flow IAT Mean=0.
    • Most values are concenterated in bin represented by the peak.
    • We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
  15. Flow IAT Std: Standard deviation of time between flows

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed around Flow IAT Std=0.
    • Most values are concenterated in bin represented by the peak.
    • There are a few observations in the range: - Flow IAT Std>=2 and Flow IAT Std<=3, Flow IAT Std>=3 and Flow IAT Std<=4 and Flow IAT Std>4.
  16. Flow IAT Max: Maximum time between flows

    • Peak was observed around Flow IAT Max=0.
    • We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
    • On X-axis values lie in the range -1.0 to +1.0
  17. Flow IAT Min: Minimum time between flows

    • Peak was observed around Flow IAT Min=0.
    • We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
  18. Fwd IAT Total: Total time between forward packets

    • Peak was observed around Fwd IAT Total=0.
    • We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
  19. Fwd IAT Mean: Mean time between forward packets

    • Peak was observed around Fwd IAT Mean=0.
    • We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
  20. Fwd IAT Std: Standard deviation of time between forward packets

    • Peak was observed around Fwd IAT Std=0.
    • There are small number of observations in the range: Fwd IAT Std>=2 and Fwd IAT Std<=3, Fwd IAT Std>=3 and Fwd IAT Std<=4, Fwd IAT Std>4.
  21. Fwd IAT Max: Maximum time between forward packets

    • Peak was observed around Fwd IAT Max=0.
    • We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
    • There are scattered but very small number of observations between Fwd IAT Max=0.0 and Fwd IAT Max=1.0
  22. Fwd IAT Min: Minimum time between forward packets

    • Peak was observed around Fed IAT Min=0.
    • We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
  23. Bwd IAT Total: Total time between backward packets

    • Peak was observed around Bwd IAT Total=0.
    • After the peak, there is consistent decline in results.
    • There are relatively smaller peaks at Bwd IAT Total=0.6 and Bwd IAT Total=1.125
    • There was a plateau region observed between Bwd IAT Total>=0.625 and Bwd IAT Total<=0.675
  24. Bwd IAT Mean: Mean time between backward packets

    • Peak was observed around Bwd IAT Mean=0.
    • After the peak, there is consistent decline in results.
    • Most observations are stacked on left side of the graph, near the peak.
    • On X-axis values lie in the range 0.0 to +1.2
  25. Bwd IAT Std: Standard deviation of time between packets

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed around Bwd IAT Std=0
    • After the peak, there is consistent decline in results.
    • There was plateau region observed between Bwd IAT Std>=1.169 and Bwd IAT Std=2
    • As the value of Bwd IAT Std increases, the size of bins decreases. In between there are a few exceptions where size of bin is greater than their neighbors.
  26. Bwd IAT Max: Maximum time between packets

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed around Bwd IAT Max=0.0
    • After the peak, there is consistent decline in results.
    • There are relatively smaller peaks at Bwd IAT Max=0.125 and Bwd IAT Max=0.575
    • Since there are multiple peaks at significant distance apart, we can also call the graph multi-modal.
    • The bins prior and after all three peaks are very small.
  27. Bwd IAT Min: Minimum time between packets

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed around Bwd IAT Min=0
    • After the peak, there is significant decline in results.
    • On X-axis values lie in the range 0.0 to 1.2
  28. Fwd PSH Flags: Forward packets with PUSH flags

    • Most of the values are concenterated in the first bin at Fwd PSH Flags=0.0
    • There were few observations at Fwd PSH Flags=1.0. This may indicate outlier in the data.
    • There were no results between Fwd PSH Flags=0.0 and Fwd PSH Flags=1.0
  29. Fwd Header Length: Length of header in forward packets

    • The distribution is skewed towards left: Negatively skewed.
    • Peak was observed around Fwd Header Length=0.0
    • There were no results for Fwd Header Length>0.0
    • There are relatively smaller size bins of left hand side of the peak.
    • On X-axis values lie in the range -2.0 to 0.0
  30. Bwd Header Length: Length of header in backward packets

    • Peak was observed around Bwd Header Length=0.0
    • Most values are concenterated at the peak.
    • There were few observations at Bwd Header Length=-1.75, -1, -0.6, -0.30
    • There were no results for Bwd Header Length>0.0
    • On X-axis values lie in the range -1.75 to 0.0
  31. Fwd Packets/s: Forward packets per second

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed around Fwd Packets/s=0
    • From Fwd Packets/s=0.0 to 1.5, the values are stacked to the right hand side of peak.
    • There are relatively smaller peaks at Fwd Packets/s= 2.0, 3.0, 4.0
    • There is a wide gap (no results) between Fwd Packets/s=3.0 and Fwd Packets/s=4.0
    • Most values are concenterated between Fwd Packets/s=0.0 and Fwd Packets/s=1.5. Between this range the graph also resembles to J-shaped graph.
  32. Bwd Packets/s: Backward packets per second

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed around Bwd Packets/s=0.0
    • Most values are concenterated between Bwd Packets.s=0.0 and Bwd Packets/s=0.5. Between this range the graph also resembles to J-shaped graph.
    • There are relatively smaller peaks at Bwd Packets/s=0.5, 1.0 and 2.0
    • After Bwd Packets/s>=1.0, the bins are scattered and gaps were observed at irregular intervals on the x-axis.
  33. Packet Length Max: Maximum length of packets

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed around Packet Length Max=0
    • After the peak, there is signifcant decline in results between Packet Length Max>=0 and Packet Length Max<=10000.
    • Between Packet Length Max=0 and Packet Length Max=10000, the graph also resembes to J-shaped graph.
    • Between Packet Length Max=10000 to 26000, the results are significantly lower than Packet Length Max=0 to 10000.
    • There were no results observed between Packet Length Max=26000 to 30000, 50000 to 60000.
    • There are some results observed between Packet Length Max=30000 to 50000.
    • There are small number of resuls observed for Packet Length Max>60000. This may indicate outlier in the data.
  34. Packet Length Mean: Mean length of packets

    • On X-axis, values lie in the range 0 to 17500.
    • The distribution is a J-shaped graph.
    • Peak was observed around Packet Length Mean=0
    • All other bins are stacked against the peak on its right hand side.
    • There is a constant decline of results as we move towards right side of the graph.
    • The results are concenterated between Packet Length Mean>=0 and Packet Length Mean<5000.
    • There is a small observation at Packet Length Mean=17500. This may indicate an outlier in the data.
  35. Packet Length Std: Standard deviation length of packets

    • The distribution is a J-shaped graph.
    • Peak was observed around Packet Length Std=0
    • All other bins are stacked against the peak on its right hand side.
    • Most values are concenterated between Packet Length Std>=0 and Packet Length Std<=5000.
    • There is an observation at Packet Length Std>20000. This may indicate an outlier in the data.
    • On X-axis, values lie in the range 0 to 20000.
  36. Packet Length Variance: Variance of length of packets

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed around Packet Length Variance=0
    • Most values are concenterated between Packet Length Variance>=0 and Packet Length Variance<=1.
    • There is an observation after long gap at Packet Length Variance>5. This may indicate an outlier in the data.
  37. SYN Flag Count: Number of SYN flags

    • Peak was observed at SYN Flag Count=0.
    • Most values are concenterated at the peak.
    • There are a few observations at SYN Flag Count=1.0. This may indicate outlier in the data.
  38. URG Flag Count: Number of URG flags

    • Peak was observed at URG Flag Count=0.
    • Most values are concenterated at the peak.
    • There are a few observations at URG Flag Count=1.0. This may indicate outlier in the data.
  39. Avg Packet Size: Average packet size

    • The distribution is J-shaped graph.
    • Most of the values are stacked at left end and then it continuously declines as we move towards right hand side of the x-axis.
    • Peak was observed at Avg Packet Size=0.
    • Most values are concenterated between Avg Packet Size>=0 and Avg Packet Size<5000.
    • There were some values afer a long gap between Avg Packet Size>5000 and Avg Packet Size <=17500. This may indicate outlier in the data.
  40. Avg Fwd Segment Size: Average forward segment size

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed at Avg Fwd Segment Size=0.
    • After the peak, there is consistent decline in results.
    • Most values are concenterated between Avg Fwd Segment Size>=0 and Avg Fwd Segment Size<=5000.
    • There were some values around Avg Fwd Sgement Size=7500.
    • There were couple of values observed in range Avg Fwd Segment Size>10000 and Avg Fwd Segment Size<12500, Avg Fwd Segemnt Size>=15000. This may indicate outlier in the data.
  41. Avg Bwd Segment Size: Average backward segment size

    • The distribution is J-shaped graph.
    • Most of the values are stacked at left end and then it continuously declines as we move towards right hand side of the x-axis.
    • Peak was observed at Avg Bwd Segment Size=0.
    • Most values are concenterated between Avg Bwd Segment Size>=0 and Avg Bwd Segment Size<=5000.
    • There is a long gap observed after Avg Bwd Segment Size>5000.
    • On extreme right end side of the graph, between Avg Bwd Segment Size>=30000 and Avg Bwd Segment Size<=35000, few values were observed. This may indicate outlier in the data.
  42. Subflow Fwd Packets: Subflow forward packets

    • Peak was observed at Subflow Fwd Packets=0.
    • After the peak, there is significant decline in results up to Subflow Fwd Packets=50000.
    • There is a plateau region observed between Subflow Fwd Packets>=100000 and Subflow Fwd Packets<=150000.
    • There were decline in the number of results observed after Subflow Fwd Packets>=150000.
    • There are many values between Subflow Fwd Packets>=50000 and Subflow Fwd Packets<=150000.
    • There is a value after Subflow Fwd Packets>300000. This may indicate outlier in the data.
  43. Subflow Fwd Bytes: Subflow forward bytes

    • The distibution is J-shaped graph.
    • Most of the values are stacked at left end and then it continuously declines as we move towards right hand side of the x-axis.
    • Most values are concenterated between Subflow Fwd Bytes>=0 and Subflow Fwd Bytes<0.2
    • There were couple of values observed around Subflow Fwd Bytes=0.4 and Subflow Fwd Bytes>1.4. This may indicate outlier in the data.
  44. Subflow Bwd Packets: Subflow backward packets

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed at Subflow Bwd Packets=0.
    • After the peak, there is consistent decline in results.
    • Most values are concenterated between Subflow Bwd Packets>=0 and Subflow Bwd Packets<=50000.
    • After Subflow Bwd Packets>50000, there are many small plateau regions at irregular gaps up to Subflow Bwd Packets<300000.
  45. Subflow Bwd Bytes: Subflow backward bytes

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed at Subflow Bwd Bytes=0.
    • Between Subflow Bwd Bytes>=0 and Subflow Bwd Bytes<=1, the graph appeared similar to J-shaped graph.
    • Most values are concenterated between Subflow Bwd Bytes>=0 and Subflow Bwd Bytes<=1.
    • There are some plateau regions on right hand side of the peak at irregular gaps.
    • On the X-axis values lie in the range 0 to 7.
  46. Init Fwd Win Bytes: Initial forward window size

    • There are two large peaks at Init Fwd Win Bytes=0 and Init Fwd Win Bytes=10000.
    • There are smaller peaks at Init Fwd Win Bytes=30000 and Init Fwd Win Bytes>60000.
    • Between the peaks, the frequency of bins is relatively very less.
    • There are no gaps in the results observed on X-axis of the graph.
    • Since the graph has multiple peaks, we can also call it multi-modal.
    • From broad overview, as we move from left to right hand side of the graph, the results decrease. But, due to tall peaks observed in between, we cannot conclude consistent decline of results.
  47. Init Bwd Win Bytes: Initial backward window size

    • There are three main peaks from overall observation of the graph: Init Bwd Win Bytes=0, 30000, 60000.
    • The tallest peak was observed at Init Bwd Win Bytes=0, the second tallest was at Init Bwd Win Bytes=60000 and the smallest peak among the three was observed at Init Bwd Win Bytes=30000.
    • Between the peaks, the frequency of bins is relatively very less.
    • There are no gaps in the results observed on X-axis of the graph.
    • Since the graph has multiple peaks, we can also call it multi-modal.
  48. Fwd Act Data Packets: Forward packets with actual data

    • Peak was observed at Fwd Act Data Packets=0.
    • After the peak, there is significant decline in results up to Fwd Act Data Packets=50000.
    • There is a plateau region observed between Fwd Act Data Packets>=100000 and Fwd Act Data Packets<=150000.
    • There are some values observed after Fwd Act Data Packets>300000. This may indicate outlier in the data.
  49. Fwd Seg Size Min: Minimum segment size in forward packets

    • Peak was observed at Fwd Seg Size Min=0.0
    • On the X-axis value lie in the range -1.4 to 0.0. Thus, the values on X-axis are all negative, we need to check the actual values under the column to determine if data is accurate or invalid.
    • There are some values observed at Fwd Seg Size Min=-1.4, Fwd Seg Size Min>-1.2 and Fwd Seg Size Min<-1.0, Fwd Seg Size Min>-0.6 and Fwd Seg Size Min<-0.4
  50. Active Mean: Mean active time

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed at Active Mean=0.
    • After the peak, there is significant decline in results.
    • There are two plateau regions observed at Active Mean=0.4 and Active Mean=0.6
    • There are no gaps in the results observed on X-axis of the graph
  51. Active Std: Standard deviation of active time

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed at Active Std=0.
    • After the peak, there is consistent decline in results up to Active Std=3.
    • There are two plateau regions observed between Active Std>=3 and Active Std<=4.
    • There is decline in the results between Active Std>=4 and Active Std<=5,
    • There is second plateau in the graph observed near Active Std=5.
    • There is decline in the results after Active Std>5.
  52. Active Max: Maximum active time

    • The distribution is skewed towards right: Positively skewed.
    • Peak was observed at Active Max=0.
    • After the peak, there is consistent decline in results up to Active Max=0.6
    • Around Active Max=0.6, there is a relatively smaller peak compared to main peak, and a plateau region of 2 bins around it.
    • Similarly, around Active Max=0.8, there is a relatively smaller peak compared to main peak, and a plateay region of 2 bins around it.
    • On X-axis values lie in the range 0 to 1.2
    • There are no gaps in the results observed on X-axis of the graph.
  53. Active Min: Minimum active time

    • The distibution is skewed towards right: Positively skewed.
    • Peak was observed at Active Min=0.
    • There is relatively smaller peak at Active Min=0.8 and a plateau region around it.
    • On X-axis value lie in the range 0 to 1.2
    • There are no gaps in the results observed on X-axis of the graph.
  54. Idle Mean: Mean idle time

    • Peak was observed at Idle Mean=0.
    • Most values are concenterated in bin represented by the peak.
    • There are some values observed at Idle Mean=2.0, 3.0, 3.5, 4.0
    • There are large gaps observed on X-axis of the graph after the peak.
  55. Idle Std: Standard deviation of idle time

    • Peak was observed at Idle Std=0.0
    • Most values are concenterated in bin represented by the peak.
    • There are some values observed at Idle Std=1.0, 1.5, 2.0 and 2.5
    • There are large gaps observed on X-axis of the graph after the peak.
  56. Idle Max: Maximum idle time

    • Peak was observed at Idle Max=0.0
    • Most values are concenterated in bin represented by the peak.
    • There are some values observed at Idle Max=0.4, 0.6, 0.8, 1.0.
    • There are large gaps observed on X-axis of the graph after the peak.
  57. Idle Min: Minimum idle time

    • Peak was observed at Idle Min=0.0
    • Most values are concenterated in bin represented by the peak.
    • There is a value observed after at Idle Min=2.5, which is after a large gap on X-axis. This may indicate outlier in the data.
In [20]:
# Plotting with normal scale (horizontal bar chart)
plt.figure(figsize=(12, 12))
cic_df['Label'].value_counts().plot(kind='barh')
plt.title('Label Distribution (Normal Scale)')
plt.show()

# Plotting with log scale (horizontal bar chart)
plt.figure(figsize=(12, 12))
cic_df['Label'].value_counts().plot(kind='barh', log=True)
plt.title('Label Distribution (Log Scale)')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [21]:
# Plotting with normal scale
plt.figure(figsize=(12, 6))
cic_df['ClassLabel'].value_counts().plot(kind='bar')
plt.title('ClassLabel Distribution (Normal Scale)')
plt.xticks(rotation=0)
plt.show()

# Plotting with log scale
plt.figure(figsize=(12, 6))
cic_df['ClassLabel'].value_counts().plot(kind='bar', log=True)
plt.title('ClassLabel Distribution (Log Scale)')
plt.xticks(rotation=0)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [22]:
#Summarizing the data
cic_df.describe(include='all').transpose()
Out[22]:
count unique top freq mean std min 25% 50% 75% max
Flow Duration 9167271.0 NaN NaN NaN 15907223.442477 656982607.269506 -919011000000.0 11604.0 396803.0 5562536.0 120000000.0
Total Fwd Packets 9167271.0 NaN NaN NaN 40.796369 2066.318093 0.0 2.0 3.0 7.0 309629.0
Total Backward Packets 9167271.0 NaN NaN NaN 9.505533 580.575061 0.0 1.0 2.0 5.0 291922.0
Fwd Packets Length Total 9167271.0 NaN NaN NaN 2063.895115 83587.211939 0.0 29.0 97.0 935.0 144391846.0
Bwd Packets Length Total 9167271.0 NaN NaN NaN 10011.181786 1281318.910198 0.0 0.0 232.0 964.0 655453030.0
Fwd Packet Length Max 9167271.0 NaN NaN NaN 294.705839 501.859251 0.0 20.0 55.0 507.0 64440.0
Fwd Packet Length Mean 9167271.0 NaN NaN NaN 81.27935 142.242706 0.0 7.0 44.0 107.666664 16529.314453
Fwd Packet Length Std 9167271.0 NaN NaN NaN 104.328316 198.990021 0.0 0.0 11.547006 180.710632 18401.582031
Bwd Packet Length Max 9167271.0 NaN NaN NaN 607.085739 1180.5395 0.0 0.0 152.0 964.0 65160.0
Bwd Packet Length Mean 9167271.0 NaN NaN NaN 200.295654 379.298279 0.0 0.0 108.0 216.375 33879.285156
Bwd Packet Length Std 9167271.0 NaN NaN NaN 240.994995 504.050537 0.0 0.0 0.0 405.464783 21326.238281
Flow Bytes/s 9167271.0 NaN NaN NaN 2854904.490479 63544921.718519 -261000000.0 55.596724 993.774576 27169.700651 2944000000.0
Flow Packets/s 9167271.0 NaN NaN NaN 10998.516442 103814.481722 -2000000.0 1.465227 16.189285 497.945973 4000000.0
Flow IAT Mean 9167271.0 NaN NaN NaN 4577766.0 296521888.0 -828219981824.0 2579.5 82560.335938 788482.8125 120000000.0
Flow IAT Std 9167271.0 NaN NaN NaN 2389986.25 449936384.0 0.0 0.0 18573.603516 836723.1875 474354483200.0
Flow IAT Max 9167271.0 NaN NaN NaN 10513515.967775 877591030.44273 -828220000000.0 10593.0 223837.0 5109311.5 979781000000.0
Flow IAT Min 9167271.0 NaN NaN NaN 2752844.970628 994810251.425043 -947405000000.0 3.0 14.0 470.0 120000000.0
Fwd IAT Total 9167271.0 NaN NaN NaN 15304688.663146 656974431.36816 -919011000000.0 283.0 71923.0 4712353.0 120000000.0
Fwd IAT Mean 9167271.0 NaN NaN NaN 5135515.5 296550368.0 -828219981824.0 135.0 28517.800781 1074626.625 120000000.0
Fwd IAT Std 9167271.0 NaN NaN NaN 2542754.0 449947776.0 0.0 0.0 454.427399 399539.28125 474354483200.0
Fwd IAT Max 9167271.0 NaN NaN NaN 10060776.647337 877585868.180635 -828220000000.0 204.0 61619.0 4226764.0 979781000000.0
Fwd IAT Min 9167271.0 NaN NaN NaN 3002675.727242 994818677.362318 -947405000000.0 2.0 36.0 455.0 120000000.0
Bwd IAT Total 9167271.0 NaN NaN NaN 9415138.051235 28114788.410477 0.0 0.0 731.0 1252558.5 120000000.0
Bwd IAT Mean 9167271.0 NaN NaN NaN 1223600.375 6206843.5 0.0 0.0 646.0 263630.15625 120000000.0
Bwd IAT Std 9167271.0 NaN NaN NaN 1224655.75 4738748.5 0.0 0.0 0.0 282084.390625 84835320.0
Bwd IAT Max 9167271.0 NaN NaN NaN 3654958.646934 13277469.581051 0.0 0.0 708.0 953075.0 120000000.0
Bwd IAT Min 9167271.0 NaN NaN NaN 482945.861238 5587847.968851 0.0 0.0 3.0 305.0 120000000.0
Fwd PSH Flags 9167271.0 NaN NaN NaN 0.031361 0.174291 0.0 0.0 0.0 0.0 1.0
Fwd Header Length 9167271.0 NaN NaN NaN -8642949.045872 926438618.023802 -212543795000.0 40.0 72.0 168.0 134480904.0
Bwd Header Length 9167271.0 NaN NaN NaN -54198.682322 12840228.879468 -17003494240.0 8.0 60.0 136.0 5838440.0
Fwd Packets/s 9167271.0 NaN NaN NaN 9276.251953 99356.585938 0.0 0.875042 8.615973 269.56601 4000000.0
Bwd Packets/s 9167271.0 NaN NaN NaN 1739.180176 18565.78125 0.0 0.140176 3.323374 77.047539 2000000.0
Packet Length Max 9167271.0 NaN NaN NaN 708.834548 1220.914561 0.0 46.0 232.0 964.0 65160.0
Packet Length Mean 9167271.0 NaN NaN NaN 142.053604 209.506439 0.0 30.75 78.666664 155.375 17344.984375
Packet Length Std 9167271.0 NaN NaN NaN 220.316681 383.539764 0.0 8.763561 73.900833 319.470306 22788.287109
Packet Length Variance 9167271.0 NaN NaN NaN 195633.8125 957366.4375 0.0 76.800003 5461.333496 102061.273438 519000000.0
SYN Flag Count 9167271.0 NaN NaN NaN 0.04044 0.19699 0.0 0.0 0.0 0.0 1.0
URG Flag Count 9167271.0 NaN NaN NaN 0.036218 0.186833 0.0 0.0 0.0 0.0 1.0
Avg Packet Size 9167271.0 NaN NaN NaN 159.688553 229.775696 0.0 41.0 99.5 174.0 17478.408203
Avg Fwd Segment Size 9167271.0 NaN NaN NaN 81.27935 142.242706 0.0 7.0 44.0 107.666664 16529.314453
Avg Bwd Segment Size 9167271.0 NaN NaN NaN 200.295654 379.298279 0.0 0.0 108.0 216.375 33879.285156
Subflow Fwd Packets 9167271.0 NaN NaN NaN 40.796369 2066.318093 0.0 2.0 3.0 7.0 309629.0
Subflow Fwd Bytes 9167271.0 NaN NaN NaN 2063.891879 83586.713236 0.0 29.0 97.0 935.0 144391846.0
Subflow Bwd Packets 9167271.0 NaN NaN NaN 9.505533 580.575061 0.0 1.0 2.0 5.0 291922.0
Subflow Bwd Bytes 9167271.0 NaN NaN NaN 10011.036974 1281298.795172 0.0 0.0 232.0 964.0 655453030.0
Init Fwd Win Bytes 9167271.0 NaN NaN NaN 10559.004783 18924.951881 -1.0 -1.0 2049.0 8192.0 65535.0
Init Bwd Win Bytes 9167271.0 NaN NaN NaN 8373.292812 19433.923807 -1.0 -1.0 123.0 259.0 65535.0
Fwd Act Data Packets 9167271.0 NaN NaN NaN 36.34433 2053.127215 0.0 0.0 1.0 4.0 309628.0
Fwd Seg Size Min 9167271.0 NaN NaN NaN -1071545.079784 33726253.943877 -1408237563.0 20.0 20.0 20.0 67240452.0
Active Mean 9167271.0 NaN NaN NaN 116771.984375 1476716.125 0.0 0.0 0.0 0.0 114000000.0
Active Std 9167271.0 NaN NaN NaN 55209.441406 854685.3125 0.0 0.0 0.0 0.0 74953352.0
Active Max 9167271.0 NaN NaN NaN 193040.412274 1981967.859903 0.0 0.0 0.0 0.0 114000000.0
Active Min 9167271.0 NaN NaN NaN 82811.023526 1272059.803775 0.0 0.0 0.0 0.0 114000000.0
Idle Mean 9167271.0 NaN NaN NaN 8015579.0 350376032.0 0.0 0.0 0.0 0.0 395571429376.0
Idle Std 9167271.0 NaN NaN NaN 549194.4375 225147856.0 0.0 0.0 0.0 0.0 262247858176.0
Idle Max 9167271.0 NaN NaN NaN 8775394.133697 832169632.699721 0.0 0.0 0.0 0.0 979781000000.0
Idle Min 9167271.0 NaN NaN NaN 7413830.334114 84632298.040205 0.0 0.0 0.0 0.0 239934000000.0
Label 9167271 33 Benign 7185881 NaN NaN NaN NaN NaN NaN NaN
ClassLabel 9167271 8 Benign 7185881 NaN NaN NaN NaN NaN NaN NaN
In [23]:
#Computing proportion of negative values among the features where the negative values were observed
negative_proportion=[]
features_with_negative_values=["Flow Duration","Flow Bytes/s","Flow Packets/s","Flow IAT Mean","Flow IAT Max","Flow IAT Min","Fwd IAT Total",
                               "Fwd IAT Mean","Fwd IAT Max","Fwd IAT Min","Fwd Header Length","Bwd Header Length","Init Fwd Win Bytes", 
                               "Init Bwd Win Bytes", "Fwd Seg Size Min"]
for feature in features_with_negative_values:
    negative_count=(cic_df[feature]<0).sum()
    negative_Proportion=negative_count*100/9167271
    negative_proportion.append((feature, negative_count, negative_Proportion))

negative_proportion_df=pd.DataFrame(negative_proportion, columns=["Feature name","Number of negative values", "Percentage of negative values"])
print(negative_proportion_df)
          Feature name  Number of negative values  \
0        Flow Duration                         96   
1         Flow Bytes/s                         53   
2       Flow Packets/s                         96   
3        Flow IAT Mean                         96   
4         Flow IAT Max                         85   
5         Flow IAT Min                       2816   
6        Fwd IAT Total                         14   
7         Fwd IAT Mean                         14   
8          Fwd IAT Max                          3   
9          Fwd IAT Min                         32   
10   Fwd Header Length                      50907   
11   Bwd Header Length                        257   
12  Init Fwd Win Bytes                    2658779   
13  Init Bwd Win Bytes                    3766283   
14    Fwd Seg Size Min                      74142   

    Percentage of negative values  
0                        0.001047  
1                        0.000578  
2                        0.001047  
3                        0.001047  
4                        0.000927  
5                        0.030718  
6                        0.000153  
7                        0.000153  
8                        0.000033  
9                        0.000349  
10                       0.555312  
11                       0.002803  
12                      29.002950  
13                      41.084015  
14                       0.808768  
In [24]:
#Fetching statistical summary for features with negative values
cic_df[features_with_negative_values].describe().transpose()
Out[24]:
count mean std min 25% 50% 75% max
Flow Duration 9167271.0 1.590722e+07 6.569826e+08 -9.190110e+11 11604.000000 396803.000000 5.562536e+06 1.200000e+08
Flow Bytes/s 9167271.0 2.854904e+06 6.354492e+07 -2.610000e+08 55.596724 993.774576 2.716970e+04 2.944000e+09
Flow Packets/s 9167271.0 1.099852e+04 1.038145e+05 -2.000000e+06 1.465227 16.189285 4.979460e+02 4.000000e+06
Flow IAT Mean 9167271.0 4.577766e+06 2.965219e+08 -8.282200e+11 2579.500000 82560.335938 7.884828e+05 1.200000e+08
Flow IAT Max 9167271.0 1.051352e+07 8.775910e+08 -8.282200e+11 10593.000000 223837.000000 5.109312e+06 9.797810e+11
Flow IAT Min 9167271.0 2.752845e+06 9.948103e+08 -9.474050e+11 3.000000 14.000000 4.700000e+02 1.200000e+08
Fwd IAT Total 9167271.0 1.530469e+07 6.569744e+08 -9.190110e+11 283.000000 71923.000000 4.712353e+06 1.200000e+08
Fwd IAT Mean 9167271.0 5.135516e+06 2.965504e+08 -8.282200e+11 135.000000 28517.800781 1.074627e+06 1.200000e+08
Fwd IAT Max 9167271.0 1.006078e+07 8.775859e+08 -8.282200e+11 204.000000 61619.000000 4.226764e+06 9.797810e+11
Fwd IAT Min 9167271.0 3.002676e+06 9.948187e+08 -9.474050e+11 2.000000 36.000000 4.550000e+02 1.200000e+08
Fwd Header Length 9167271.0 -8.642949e+06 9.264386e+08 -2.125438e+11 40.000000 72.000000 1.680000e+02 1.344809e+08
Bwd Header Length 9167271.0 -5.419868e+04 1.284023e+07 -1.700349e+10 8.000000 60.000000 1.360000e+02 5.838440e+06
Init Fwd Win Bytes 9167271.0 1.055900e+04 1.892495e+04 -1.000000e+00 -1.000000 2049.000000 8.192000e+03 6.553500e+04
Init Bwd Win Bytes 9167271.0 8.373293e+03 1.943392e+04 -1.000000e+00 -1.000000 123.000000 2.590000e+02 6.553500e+04
Fwd Seg Size Min 9167271.0 -1.071545e+06 3.372625e+07 -1.408238e+09 20.000000 20.000000 2.000000e+01 6.724045e+07
In [25]:
#Removing Init Fwd Win Bytes from the list of negative values
features_with_negative_values.remove("Init Fwd Win Bytes")
features_with_negative_values.remove("Init Bwd Win Bytes")
print("List of updated list of features with negative values: ",features_with_negative_values)
List of updated list of features with negative values:  ['Flow Duration', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Max', 'Fwd IAT Min', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Seg Size Min']
In [26]:
#Imputing all negative values with their feature's respective median except for Init Fwd Win Bytes, Init Bwd Win Bytes
for c in features_with_negative_values: 
    median_value = cic_df[c][cic_df[c] >= 0].median() 
    cic_df[c] = cic_df[c].apply(lambda x: median_value if x < 0 else x)
In [27]:
#Fetching statistical summary after performing imputation
cic_df[features_with_negative_values].describe().transpose()
Out[27]:
count mean std min 25% 50% 75% max
Flow Duration 9167271.0 1.657217e+07 3.407960e+07 1.000000 11605.000000 396839.000000 5.562536e+06 1.200000e+08
Flow Bytes/s 9167271.0 2.855014e+06 6.354482e+07 0.000000 55.599289 993.799601 2.716970e+04 2.944000e+09
Flow Packets/s 9167271.0 1.101542e+04 1.036516e+05 0.016667 1.465322 16.190865 4.979460e+02 4.000000e+06
Flow IAT Mean 9167271.0 4.717615e+06 1.564841e+07 0.333333 2580.000000 82566.500000 7.884828e+05 1.200000e+08
Flow IAT Max 9167271.0 1.062255e+07 8.321956e+08 1.000000 10595.000000 223853.000000 5.109312e+06 9.797810e+11
Flow IAT Min 9167271.0 3.915124e+06 1.560000e+07 0.000000 3.000000 14.000000 4.700000e+02 1.200000e+08
Fwd IAT Total 9167271.0 1.596963e+07 3.393343e+07 0.000000 283.000000 71924.000000 4.712353e+06 1.200000e+08
Fwd IAT Mean 9167271.0 5.275366e+06 1.617462e+07 0.000000 135.000000 28518.000000 1.074627e+06 1.200000e+08
Fwd IAT Max 9167271.0 1.016981e+07 8.321902e+08 0.000000 204.000000 61619.000000 4.226764e+06 9.797810e+11
Fwd IAT Min 9167271.0 4.164954e+06 1.611036e+07 0.000000 2.000000 36.000000 4.550000e+02 1.200000e+08
Fwd Header Length 9167271.0 4.675070e+02 4.810506e+04 0.000000 40.000000 72.000000 1.680000e+02 1.344809e+08
Bwd Header Length 9167271.0 2.119982e+02 1.162439e+04 0.000000 8.000000 60.000000 1.360000e+02 5.838440e+06
Fwd Seg Size Min 9167271.0 2.814285e+01 2.220807e+04 0.000000 20.000000 20.000000 2.000000e+01 6.724045e+07

We have successfully handled negative values among 13 features.

We have two more features with substantial number of negative values: -

  • Init Fwd Win Bytes : 29% values are negative
  • Init Bwd Win Bytes : 41% values are negative

If we drop the rows with negative values among the two columns, we will lose massive volume of information from other features in the dataset.

If we drop the two columns, we will lose out the data from those columns having positive values and which may be important.

In [28]:
#Fetching the count of target feature Label and ClassLabel when Init Fwd Win Bytes have negative values
negative_init_fwd_win_bytes=cic_df[cic_df['Init Fwd Win Bytes']<0]
print("\nLabel: \n",negative_init_fwd_win_bytes['Label'].value_counts())
print("\nClassLabel: \n",negative_init_fwd_win_bytes['ClassLabel'].value_counts())
Label: 
 Label
Benign                  2340737
DDoS-NTP                 120966
DDoS-TFTP                 98611
Infiltration              31750
DDoS-UDP                  28855
DDoS-MSSQL                11779
DDoS-Ddossim               5115
DDoS-DNS                   3662
DoS-Slowread               2786
DDoS-LDAP                  2085
DDoS-SNMP                  2013
DDoS-UDPLag                1985
DDoS-Slowloris             1858
DoS-Slowheaders            1649
DDoS-LOIC-HTTP              797
DoS-Hulk                    747
DoS-Rudy                    699
DDoS-NetBIOS                668
DoS-Goldeneye               632
DoS-Slowbody                621
Portscan                    290
Botnet                      258
Webattack-bruteforce        145
DDoS-Syn                     67
Webattack-XSS                 4
Name: count, dtype: int64

ClassLabel: 
 ClassLabel
Benign          2340737
DDoS             278461
Infiltration      31750
DoS                7134
Portscan            290
Botnet              258
Webattack           149
Name: count, dtype: int64
In [29]:
#Fetching the count of target feature Label and ClassLabel when Init Bwd Win Bytes have negative values
negative_init_fwd_win_bytes=cic_df[cic_df['Init Bwd Win Bytes']<0]
print("\nLabel: \n",negative_init_fwd_win_bytes['Label'].value_counts())
print("\nClassLabel: \n",negative_init_fwd_win_bytes['ClassLabel'].value_counts())
Label: 
 Label
Benign                  2916889
DDoS-LOIC-HTTP           286075
DoS-Hulk                 128830
DDoS-NTP                 120979
DDoS-TFTP                 98678
DDoS                      46524
Infiltration              42377
DDoS-HOIC                 35111
DDoS-UDP                  28855
DoS-Goldeneye             16206
DDoS-MSSQL                11780
DDoS-Syn                   9716
DDoS-UDPLag                6436
DoS-Slowloris              3873
DDoS-DNS                   3662
DoS-Slowhttptest           3145
DDoS-LDAP                  2086
DDoS-SNMP                  2013
Botnet                     1520
DDoS-NetBIOS                668
Portscan                    352
Webattack-bruteforce        280
Webattack-XSS               128
Bruteforce-FTP               63
Webattack-SQLi               20
Bruteforce-SSH               17
Name: count, dtype: int64

ClassLabel: 
 ClassLabel
Benign          2916889
DDoS             652583
DoS              152054
Infiltration      42377
Botnet             1520
Webattack           428
Portscan            352
Bruteforce           80
Name: count, dtype: int64
  • At present, the dataset is massively imbalanced with 78% records classifified as Benign and 22% records classifified as Malicious.
  • Based on the results obtained above by fetching the counts for "negative values" of the two features we observed: -
    • Init Fwd Win Bytes: 88% records are Benign and 12% records are Malicious.
    • Init Bwd Win Bytes: 78% records are Benign and 22% records are Malicious.
  • Thus, the negative values for the two features do not give any different characteristic of events when compared with characteristics of the complete dataset.
  • Moreover, both features by definition cannot have negative values. Thus, it indicates data quality issues in those records.
  • We can perform prediction of data for the two features having negative values, however, due to constraint of time we will not adopt that approach.
  • As the result, we shall perform imputation using the respective median values.
In [30]:
for c in ['Init Fwd Win Bytes','Init Bwd Win Bytes']: 
    median_value = cic_df[c][cic_df[c] >= 0].median() 
    cic_df[c] = cic_df[c].apply(lambda x: median_value if x < 0 else x)
In [31]:
#Fetching statistical summary after performing imputation
cic_df.describe(include='all').transpose()
Out[31]:
count unique top freq mean std min 25% 50% 75% max
Flow Duration 9167271.0 NaN NaN NaN 16572166.706035 34079602.158443 1.0 11605.0 396839.0 5562536.0 120000000.0
Total Fwd Packets 9167271.0 NaN NaN NaN 40.796369 2066.318093 0.0 2.0 3.0 7.0 309629.0
Total Backward Packets 9167271.0 NaN NaN NaN 9.505533 580.575061 0.0 1.0 2.0 5.0 291922.0
Fwd Packets Length Total 9167271.0 NaN NaN NaN 2063.895115 83587.211939 0.0 29.0 97.0 935.0 144391846.0
Bwd Packets Length Total 9167271.0 NaN NaN NaN 10011.181786 1281318.910198 0.0 0.0 232.0 964.0 655453030.0
Fwd Packet Length Max 9167271.0 NaN NaN NaN 294.705839 501.859251 0.0 20.0 55.0 507.0 64440.0
Fwd Packet Length Mean 9167271.0 NaN NaN NaN 81.27935 142.242706 0.0 7.0 44.0 107.666664 16529.314453
Fwd Packet Length Std 9167271.0 NaN NaN NaN 104.328316 198.990021 0.0 0.0 11.547006 180.710632 18401.582031
Bwd Packet Length Max 9167271.0 NaN NaN NaN 607.085739 1180.5395 0.0 0.0 152.0 964.0 65160.0
Bwd Packet Length Mean 9167271.0 NaN NaN NaN 200.295654 379.298279 0.0 0.0 108.0 216.375 33879.285156
Bwd Packet Length Std 9167271.0 NaN NaN NaN 240.994995 504.050537 0.0 0.0 0.0 405.464783 21326.238281
Flow Bytes/s 9167271.0 NaN NaN NaN 2855014.466597 63544820.766049 0.0 55.599289 993.799601 27169.700651 2944000000.0
Flow Packets/s 9167271.0 NaN NaN NaN 11015.42319 103651.622741 0.016667 1.465322 16.190865 497.945973 4000000.0
Flow IAT Mean 9167271.0 NaN NaN NaN 4717614.943425 15648408.216738 0.333333 2580.0 82566.5 788482.8125 120000000.0
Flow IAT Std 9167271.0 NaN NaN NaN 2389986.25 449936384.0 0.0 0.0 18573.603516 836723.1875 474354483200.0
Flow IAT Max 9167271.0 NaN NaN NaN 10622550.600614 832195631.034822 1.0 10595.0 223853.0 5109311.5 979781000000.0
Flow IAT Min 9167271.0 NaN NaN NaN 3915123.586074 15599995.873778 0.0 3.0 14.0 470.0 120000000.0
Fwd IAT Total 9167271.0 NaN NaN NaN 15969627.880819 33933433.81845 0.0 283.0 71924.0 4712353.0 120000000.0
Fwd IAT Mean 9167271.0 NaN NaN NaN 5275366.172166 16174616.572548 0.0 135.0 28518.0 1074626.625 120000000.0
Fwd IAT Std 9167271.0 NaN NaN NaN 2542754.0 449947776.0 0.0 0.0 454.427399 399539.28125 474354483200.0
Fwd IAT Max 9167271.0 NaN NaN NaN 10169809.224737 832190246.518281 0.0 204.0 61619.0 4226764.0 979781000000.0
Fwd IAT Min 9167271.0 NaN NaN NaN 4164954.338113 16110357.348586 0.0 2.0 36.0 455.0 120000000.0
Bwd IAT Total 9167271.0 NaN NaN NaN 9415138.051235 28114788.410477 0.0 0.0 731.0 1252558.5 120000000.0
Bwd IAT Mean 9167271.0 NaN NaN NaN 1223600.375 6206843.5 0.0 0.0 646.0 263630.15625 120000000.0
Bwd IAT Std 9167271.0 NaN NaN NaN 1224655.75 4738748.5 0.0 0.0 0.0 282084.390625 84835320.0
Bwd IAT Max 9167271.0 NaN NaN NaN 3654958.646934 13277469.581051 0.0 0.0 708.0 953075.0 120000000.0
Bwd IAT Min 9167271.0 NaN NaN NaN 482945.861238 5587847.968851 0.0 0.0 3.0 305.0 120000000.0
Fwd PSH Flags 9167271.0 NaN NaN NaN 0.031361 0.174291 0.0 0.0 0.0 0.0 1.0
Fwd Header Length 9167271.0 NaN NaN NaN 467.506976 48105.058152 0.0 40.0 72.0 168.0 134480904.0
Bwd Header Length 9167271.0 NaN NaN NaN 211.998229 11624.394465 0.0 8.0 60.0 136.0 5838440.0
Fwd Packets/s 9167271.0 NaN NaN NaN 9276.251953 99356.585938 0.0 0.875042 8.615973 269.56601 4000000.0
Bwd Packets/s 9167271.0 NaN NaN NaN 1739.180176 18565.78125 0.0 0.140176 3.323374 77.047539 2000000.0
Packet Length Max 9167271.0 NaN NaN NaN 708.834548 1220.914561 0.0 46.0 232.0 964.0 65160.0
Packet Length Mean 9167271.0 NaN NaN NaN 142.053604 209.506439 0.0 30.75 78.666664 155.375 17344.984375
Packet Length Std 9167271.0 NaN NaN NaN 220.316681 383.539764 0.0 8.763561 73.900833 319.470306 22788.287109
Packet Length Variance 9167271.0 NaN NaN NaN 195633.8125 957366.4375 0.0 76.800003 5461.333496 102061.273438 519000000.0
SYN Flag Count 9167271.0 NaN NaN NaN 0.04044 0.19699 0.0 0.0 0.0 0.0 1.0
URG Flag Count 9167271.0 NaN NaN NaN 0.036218 0.186833 0.0 0.0 0.0 0.0 1.0
Avg Packet Size 9167271.0 NaN NaN NaN 159.688553 229.775696 0.0 41.0 99.5 174.0 17478.408203
Avg Fwd Segment Size 9167271.0 NaN NaN NaN 81.27935 142.242706 0.0 7.0 44.0 107.666664 16529.314453
Avg Bwd Segment Size 9167271.0 NaN NaN NaN 200.295654 379.298279 0.0 0.0 108.0 216.375 33879.285156
Subflow Fwd Packets 9167271.0 NaN NaN NaN 40.796369 2066.318093 0.0 2.0 3.0 7.0 309629.0
Subflow Fwd Bytes 9167271.0 NaN NaN NaN 2063.891879 83586.713236 0.0 29.0 97.0 935.0 144391846.0
Subflow Bwd Packets 9167271.0 NaN NaN NaN 9.505533 580.575061 0.0 1.0 2.0 5.0 291922.0
Subflow Bwd Bytes 9167271.0 NaN NaN NaN 10011.036974 1281298.795172 0.0 0.0 232.0 964.0 655453030.0
Init Fwd Win Bytes 9167271.0 NaN NaN NaN 12935.216454 17938.509047 0.0 8192.0 8192.0 8192.0 65535.0
Init Bwd Win Bytes 9167271.0 NaN NaN NaN 8470.251087 19392.445994 0.0 219.0 235.0 259.0 65535.0
Fwd Act Data Packets 9167271.0 NaN NaN NaN 36.34433 2053.127215 0.0 0.0 1.0 4.0 309628.0
Fwd Seg Size Min 9167271.0 NaN NaN NaN 28.142852 22208.067019 0.0 20.0 20.0 20.0 67240452.0
Active Mean 9167271.0 NaN NaN NaN 116771.984375 1476716.125 0.0 0.0 0.0 0.0 114000000.0
Active Std 9167271.0 NaN NaN NaN 55209.441406 854685.3125 0.0 0.0 0.0 0.0 74953352.0
Active Max 9167271.0 NaN NaN NaN 193040.412274 1981967.859903 0.0 0.0 0.0 0.0 114000000.0
Active Min 9167271.0 NaN NaN NaN 82811.023526 1272059.803775 0.0 0.0 0.0 0.0 114000000.0
Idle Mean 9167271.0 NaN NaN NaN 8015579.0 350376032.0 0.0 0.0 0.0 0.0 395571429376.0
Idle Std 9167271.0 NaN NaN NaN 549194.4375 225147856.0 0.0 0.0 0.0 0.0 262247858176.0
Idle Max 9167271.0 NaN NaN NaN 8775394.133697 832169632.699721 0.0 0.0 0.0 0.0 979781000000.0
Idle Min 9167271.0 NaN NaN NaN 7413830.334114 84632298.040205 0.0 0.0 0.0 0.0 239934000000.0
Label 9167271 33 Benign 7185881 NaN NaN NaN NaN NaN NaN NaN
ClassLabel 9167271 8 Benign 7185881 NaN NaN NaN NaN NaN NaN NaN
In [32]:
#Creating a new feature: isMalicious : Yes=1 , No=0
cic_df['isMalicious']=np.where(cic_df['ClassLabel']!='Benign', 1, 0)
cic_df.head(10)
Out[32]:
Flow Duration Total Fwd Packets Total Backward Packets Fwd Packets Length Total Bwd Packets Length Total Fwd Packet Length Max Fwd Packet Length Mean Fwd Packet Length Std Bwd Packet Length Max Bwd Packet Length Mean ... Active Std Active Max Active Min Idle Mean Idle Std Idle Max Idle Min Label ClassLabel isMalicious
0 4.0 2 0 12.0 0.0 6.0 6.000000 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign 0
1 1.0 2 0 12.0 0.0 6.0 6.000000 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign 0
2 3.0 2 0 12.0 0.0 6.0 6.000000 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign 0
3 1.0 2 0 12.0 0.0 6.0 6.000000 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign 0
4 609.0 7 4 484.0 414.0 233.0 69.142860 111.967896 207.0 103.5 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign 0
5 879.0 9 4 656.0 3064.0 313.0 72.888885 136.153809 1532.0 766.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign 0
6 1160.0 9 6 3134.0 3048.0 1552.0 348.222229 682.482544 1518.0 508.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign 0
7 524.0 7 4 2812.0 2820.0 1397.0 401.714294 679.914856 1410.0 705.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign 0
8 6.0 1 1 6.0 6.0 6.0 6.000000 0.000000 6.0 6.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign 0
9 1119.0 9 6 3160.0 3060.0 1565.0 351.111115 688.214966 1524.0 510.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign Benign 0

10 rows × 60 columns

In [33]:
cic_df['isMalicious'].value_counts()
Out[33]:
isMalicious
0    7185881
1    1981390
Name: count, dtype: int64
In [34]:
cic_df['ClassLabel'].value_counts()
Out[34]:
ClassLabel
Benign          7185881
DDoS            1234727
DoS              397344
Botnet           145968
Bruteforce       103244
Infiltration      94857
Webattack          2995
Portscan           2255
Name: count, dtype: int64
In [35]:
cic_df.shape
Out[35]:
(9167271, 60)
In [36]:
cic_df=cic_df.drop(['Label'],axis=1)
In [37]:
cic_df.shape
Out[37]:
(9167271, 59)
  • We have created a new feature: isMalicious, which is binary. We will use this for Binary classification.

  • We will use the feature: ClassLabel for Multi-class classification, to determine the type of attack.

  • We have dropped the feature: Label, because it gives futher sub-type of the attack, which will not be in the scope of our work.

  • As the result, our two target features in the dataset are: -

    • isMalicious: For binary classification
    • ClassLabel: For multi-class classification

Since the original dataset is too large, carrying out analysis over the complete dataset leads to over utilization of system's memory and the notebook stalls to work.

As the result, we will create a sample of the original dataset to carryout all the analysis.

We will take 20% of the original dataset as the sample size.

In [38]:
sample_size=int(0.2*len(cic_df))
sampled_cic_df=cic_df.sample(n=sample_size, replace=False, random_state=42)
sampled_cic_df.shape
Out[38]:
(1833454, 59)
In [39]:
sampled_cic_df['isMalicious'].value_counts()
Out[39]:
isMalicious
0    1437467
1     395987
Name: count, dtype: int64
In [40]:
sampled_cic_df['ClassLabel'].value_counts()
Out[40]:
ClassLabel
Benign          1437467
DDoS             246982
DoS               79186
Botnet            29348
Bruteforce        20546
Infiltration      18870
Webattack           625
Portscan            430
Name: count, dtype: int64
  • In the sampled dataset, the imbalanced nature of target is very similar to imbalanced nature of original dataset.
  • In the sampled dataset, all the categories under the column: ClassLabel are observed in the same order as the original dataset.
In [41]:
sampled_cic_df.dtypes
Out[41]:
Flow Duration               float64
Total Fwd Packets             int32
Total Backward Packets        int32
Fwd Packets Length Total    float64
Bwd Packets Length Total    float64
Fwd Packet Length Max       float64
Fwd Packet Length Mean      float32
Fwd Packet Length Std       float32
Bwd Packet Length Max       float64
Bwd Packet Length Mean      float32
Bwd Packet Length Std       float32
Flow Bytes/s                float64
Flow Packets/s              float64
Flow IAT Mean               float64
Flow IAT Std                float32
Flow IAT Max                float64
Flow IAT Min                float64
Fwd IAT Total               float64
Fwd IAT Mean                float64
Fwd IAT Std                 float32
Fwd IAT Max                 float64
Fwd IAT Min                 float64
Bwd IAT Total               float64
Bwd IAT Mean                float32
Bwd IAT Std                 float32
Bwd IAT Max                 float64
Bwd IAT Min                 float64
Fwd PSH Flags                  int8
Fwd Header Length           float64
Bwd Header Length           float64
Fwd Packets/s               float32
Bwd Packets/s               float32
Packet Length Max           float64
Packet Length Mean          float32
Packet Length Std           float32
Packet Length Variance      float32
SYN Flag Count                 int8
URG Flag Count                 int8
Avg Packet Size             float32
Avg Fwd Segment Size        float32
Avg Bwd Segment Size        float32
Subflow Fwd Packets           int32
Subflow Fwd Bytes             int32
Subflow Bwd Packets           int32
Subflow Bwd Bytes             int32
Init Fwd Win Bytes          float64
Init Bwd Win Bytes          float64
Fwd Act Data Packets          int32
Fwd Seg Size Min            float64
Active Mean                 float32
Active Std                  float32
Active Max                  float64
Active Min                  float64
Idle Mean                   float32
Idle Std                    float32
Idle Max                    float64
Idle Min                    float64
ClassLabel                   object
isMalicious                   int32
dtype: object
In [42]:
#Identifying outliers in sampled dataset
independent_features=sampled_cic_df.copy()
independent_features=independent_features.drop(['ClassLabel','isMalicious'],axis=1)
q1=independent_features.quantile(0.25)
q3=independent_features.quantile(0.75)
iqr=q3-q1
outlier=(independent_features<(q1-1.5*iqr))|(independent_features>(q3+1.5*iqr))
outlier_count=outlier.sum()
outlier_percentage=round(outlier.mean() * 100, 2)
outlier_stats=pd.concat([outlier_count, outlier_percentage], axis=1)
outlier_stats.columns = ['Outlier Count', 'Outlier Percentage']
print(outlier_stats)
                          Outlier Count  Outlier Percentage
Flow Duration                    362139               19.75
Total Fwd Packets                167485                9.13
Total Backward Packets           176328                9.62
Fwd Packets Length Total          71763                3.91
Bwd Packets Length Total         265389               14.47
Fwd Packet Length Max             24476                1.33
Fwd Packet Length Mean            74245                4.05
Fwd Packet Length Std             21018                1.15
Bwd Packet Length Max             69888                3.81
Bwd Packet Length Mean           140674                7.67
Bwd Packet Length Std             56299                3.07
Flow Bytes/s                     377550               20.59
Flow Packets/s                   380170               20.74
Flow IAT Mean                    346826               18.92
Flow IAT Std                     284585               15.52
Flow IAT Max                     255816               13.95
Flow IAT Min                     404158               22.04
Fwd IAT Total                    352629               19.23
Fwd IAT Mean                     355920               19.41
Fwd IAT Std                      395445               21.57
Fwd IAT Max                      252256               13.76
Fwd IAT Min                      420002               22.91
Bwd IAT Total                    316674               17.27
Bwd IAT Mean                     257401               14.04
Bwd IAT Std                      291284               15.89
Bwd IAT Max                      266733               14.55
Bwd IAT Min                      389592               21.25
Fwd PSH Flags                     57754                3.15
Fwd Header Length                127832                6.97
Bwd Header Length                147846                8.06
Fwd Packets/s                    379750               20.71
Bwd Packets/s                    381136               20.79
Packet Length Max                 77407                4.22
Packet Length Mean               179993                9.82
Packet Length Std                 68317                3.73
Packet Length Variance           160010                8.73
SYN Flag Count                    74451                4.06
URG Flag Count                    66218                3.61
Avg Packet Size                  177462                9.68
Avg Fwd Segment Size              74245                4.05
Avg Bwd Segment Size             140674                7.67
Subflow Fwd Packets              167485                9.13
Subflow Fwd Bytes                 71763                3.91
Subflow Bwd Packets              176328                9.62
Subflow Bwd Bytes                265389               14.47
Init Fwd Win Bytes               719371               39.24
Init Bwd Win Bytes               684281               37.32
Fwd Act Data Packets             122997                6.71
Fwd Seg Size Min                 678664               37.02
Active Mean                      266956               14.56
Active Std                       150242                8.19
Active Max                       266956               14.56
Active Min                       266956               14.56
Idle Mean                        385990               21.05
Idle Std                         173591                9.47
Idle Max                         385990               21.05
Idle Min                         385990               21.05
In [43]:
#Fetching outliers grouped by isMalicious
outlier_counts = {}
for i in independent_features:
    for attack_type in sampled_cic_df['isMalicious'].unique():
        attack_data = sampled_cic_df[i][sampled_cic_df['isMalicious'] == attack_type]
        q1, q3 = np.percentile(attack_data, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        num_outliers = ((attack_data < lower_bound) | (attack_data > upper_bound)).sum()
        outlier_percent = num_outliers / len(attack_data) * 100
        outlier_counts[(i, attack_type)] = (num_outliers, outlier_percent)

for i in independent_features:
  print(f'Feature: {i}')
  for attack_type in sampled_cic_df['isMalicious'].unique():
    num_outliers, outlier_percent = outlier_counts[(i, attack_type)]
    print(f'- {attack_type}: {num_outliers} ({outlier_percent:.2f}%)')
  print()
Feature: Flow Duration
- 0: 271501 (18.89%)
- 1: 69145 (17.46%)

Feature: Total Fwd Packets
- 0: 82873 (5.77%)
- 1: 48965 (12.37%)

Feature: Total Backward Packets
- 0: 87244 (6.07%)
- 1: 23486 (5.93%)

Feature: Fwd Packets Length Total
- 0: 39934 (2.78%)
- 1: 79983 (20.20%)

Feature: Bwd Packets Length Total
- 0: 177607 (12.36%)
- 1: 71509 (18.06%)

Feature: Fwd Packet Length Max
- 0: 10822 (0.75%)
- 1: 2362 (0.60%)

Feature: Fwd Packet Length Mean
- 0: 28829 (2.01%)
- 1: 54263 (13.70%)

Feature: Fwd Packet Length Std
- 0: 14881 (1.04%)
- 1: 533 (0.13%)

Feature: Bwd Packet Length Max
- 0: 27262 (1.90%)
- 1: 48455 (12.24%)

Feature: Bwd Packet Length Mean
- 0: 121408 (8.45%)
- 1: 49662 (12.54%)

Feature: Bwd Packet Length Std
- 0: 40833 (2.84%)
- 1: 48473 (12.24%)

Feature: Flow Bytes/s
- 0: 319778 (22.25%)
- 1: 57618 (14.55%)

Feature: Flow Packets/s
- 0: 301763 (20.99%)
- 1: 51261 (12.95%)

Feature: Flow IAT Mean
- 0: 258296 (17.97%)
- 1: 51632 (13.04%)

Feature: Flow IAT Std
- 0: 186774 (12.99%)
- 1: 80205 (20.25%)

Feature: Flow IAT Max
- 0: 166932 (11.61%)
- 1: 67261 (16.99%)

Feature: Flow IAT Min
- 0: 308112 (21.43%)
- 1: 95849 (24.21%)

Feature: Fwd IAT Total
- 0: 260387 (18.11%)
- 1: 74387 (18.79%)

Feature: Fwd IAT Mean
- 0: 252056 (17.53%)
- 1: 69485 (17.55%)

Feature: Fwd IAT Std
- 0: 310032 (21.57%)
- 1: 91566 (23.12%)

Feature: Fwd IAT Max
- 0: 159866 (11.12%)
- 1: 69246 (17.49%)

Feature: Fwd IAT Min
- 0: 324450 (22.57%)
- 1: 94919 (23.97%)

Feature: Bwd IAT Total
- 0: 251328 (17.48%)
- 1: 44968 (11.36%)

Feature: Bwd IAT Mean
- 0: 226066 (15.73%)
- 1: 62888 (15.88%)

Feature: Bwd IAT Std
- 0: 223055 (15.52%)
- 1: 80501 (20.33%)

Feature: Bwd IAT Max
- 0: 236348 (16.44%)
- 1: 49815 (12.58%)

Feature: Bwd IAT Min
- 0: 336707 (23.42%)
- 1: 32229 (8.14%)

Feature: Fwd PSH Flags
- 0: 54784 (3.81%)
- 1: 2970 (0.75%)

Feature: Fwd Header Length
- 0: 83369 (5.80%)
- 1: 60675 (15.32%)

Feature: Bwd Header Length
- 0: 115152 (8.01%)
- 1: 34435 (8.70%)

Feature: Fwd Packets/s
- 0: 298236 (20.75%)
- 1: 51461 (13.00%)

Feature: Bwd Packets/s
- 0: 313900 (21.84%)
- 1: 74384 (18.78%)

Feature: Packet Length Max
- 0: 28885 (2.01%)
- 1: 48465 (12.24%)

Feature: Packet Length Mean
- 0: 85914 (5.98%)
- 1: 26172 (6.61%)

Feature: Packet Length Std
- 0: 20542 (1.43%)
- 1: 48486 (12.24%)

Feature: Packet Length Variance
- 0: 111343 (7.75%)
- 1: 49627 (12.53%)

Feature: SYN Flag Count
- 0: 69042 (4.80%)
- 1: 5409 (1.37%)

Feature: URG Flag Count
- 0: 63194 (4.40%)
- 1: 3024 (0.76%)

Feature: Avg Packet Size
- 0: 78454 (5.46%)
- 1: 18905 (4.77%)

Feature: Avg Fwd Segment Size
- 0: 28829 (2.01%)
- 1: 54263 (13.70%)

Feature: Avg Bwd Segment Size
- 0: 121408 (8.45%)
- 1: 49662 (12.54%)

Feature: Subflow Fwd Packets
- 0: 82873 (5.77%)
- 1: 48965 (12.37%)

Feature: Subflow Fwd Bytes
- 0: 39934 (2.78%)
- 1: 79983 (20.20%)

Feature: Subflow Bwd Packets
- 0: 87244 (6.07%)
- 1: 23486 (5.93%)

Feature: Subflow Bwd Bytes
- 0: 177607 (12.36%)
- 1: 71509 (18.06%)

Feature: Init Fwd Win Bytes
- 0: 493994 (34.37%)
- 1: 76819 (19.40%)

Feature: Init Bwd Win Bytes
- 0: 349532 (24.32%)
- 1: 24436 (6.17%)

Feature: Fwd Act Data Packets
- 0: 76932 (5.35%)
- 1: 50263 (12.69%)

Feature: Fwd Seg Size Min
- 0: 562595 (39.14%)
- 1: 116069 (29.31%)

Feature: Active Mean
- 0: 207682 (14.45%)
- 1: 59274 (14.97%)

Feature: Active Std
- 0: 138991 (9.67%)
- 1: 11251 (2.84%)

Feature: Active Max
- 0: 207682 (14.45%)
- 1: 59274 (14.97%)

Feature: Active Min
- 0: 207682 (14.45%)
- 1: 59274 (14.97%)

Feature: Idle Mean
- 0: 272236 (18.94%)
- 1: 64936 (16.40%)

Feature: Idle Std
- 0: 156188 (10.87%)
- 1: 17403 (4.39%)

Feature: Idle Max
- 0: 272236 (18.94%)
- 1: 67834 (17.13%)

Feature: Idle Min
- 0: 272236 (18.94%)
- 1: 68185 (17.22%)

In [44]:
#Fetching outliers grouped by ClassLabel
outlier_counts = {}
for i in independent_features:
    for attack_type in sampled_cic_df['ClassLabel'].unique():
        attack_data = sampled_cic_df[i][sampled_cic_df['ClassLabel'] == attack_type]
        q1, q3 = np.percentile(attack_data, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        num_outliers = ((attack_data < lower_bound) | (attack_data > upper_bound)).sum()
        outlier_percent = num_outliers / len(attack_data) * 100
        outlier_counts[(i, attack_type)] = (num_outliers, outlier_percent)

for i in independent_features:
  print(f'Feature: {i}')
  for attack_type in sampled_cic_df['ClassLabel'].unique():
    num_outliers, outlier_percent = outlier_counts[(i, attack_type)]
    print(f'- {attack_type}: {num_outliers} ({outlier_percent:.2f}%)')
  print()
Feature: Flow Duration
- Benign: 271501 (18.89%)
- DDoS: 39367 (15.94%)
- DoS: 0 (0.00%)
- Infiltration: 3981 (21.10%)
- Botnet: 3135 (10.68%)
- Bruteforce: 2609 (12.70%)
- Portscan: 94 (21.86%)
- Webattack: 200 (32.00%)

Feature: Total Fwd Packets
- Benign: 82873 (5.77%)
- DDoS: 35630 (14.43%)
- DoS: 1542 (1.95%)
- Infiltration: 1110 (5.88%)
- Botnet: 1419 (4.84%)
- Bruteforce: 2602 (12.66%)
- Portscan: 34 (7.91%)
- Webattack: 228 (36.48%)

Feature: Total Backward Packets
- Benign: 87244 (6.07%)
- DDoS: 199 (0.08%)
- DoS: 85 (0.11%)
- Infiltration: 1194 (6.33%)
- Botnet: 1540 (5.25%)
- Bruteforce: 1912 (9.31%)
- Portscan: 163 (37.91%)
- Webattack: 211 (33.76%)

Feature: Fwd Packets Length Total
- Benign: 39934 (2.78%)
- DDoS: 53978 (21.86%)
- DoS: 3215 (4.06%)
- Infiltration: 504 (2.67%)
- Botnet: 1937 (6.60%)
- Bruteforce: 2495 (12.14%)
- Portscan: 70 (16.28%)
- Webattack: 149 (23.84%)

Feature: Bwd Packets Length Total
- Benign: 177607 (12.36%)
- DDoS: 16430 (6.65%)
- DoS: 24 (0.03%)
- Infiltration: 2138 (11.33%)
- Botnet: 1610 (5.49%)
- Bruteforce: 1913 (9.31%)
- Portscan: 36 (8.37%)
- Webattack: 122 (19.52%)

Feature: Fwd Packet Length Max
- Benign: 10822 (0.75%)
- DDoS: 1658 (0.67%)
- DoS: 117 (0.15%)
- Infiltration: 575 (3.05%)
- Botnet: 1937 (6.60%)
- Bruteforce: 1330 (6.47%)
- Portscan: 75 (17.44%)
- Webattack: 149 (23.84%)

Feature: Fwd Packet Length Mean
- Benign: 28829 (2.01%)
- DDoS: 53668 (21.73%)
- DoS: 1100 (1.39%)
- Infiltration: 707 (3.75%)
- Botnet: 1937 (6.60%)
- Bruteforce: 1822 (8.87%)
- Portscan: 65 (15.12%)
- Webattack: 149 (23.84%)

Feature: Fwd Packet Length Std
- Benign: 14881 (1.04%)
- DDoS: 44707 (18.10%)
- DoS: 137 (0.17%)
- Infiltration: 351 (1.86%)
- Botnet: 1937 (6.60%)
- Bruteforce: 2556 (12.44%)
- Portscan: 30 (6.98%)
- Webattack: 122 (19.52%)

Feature: Bwd Packet Length Max
- Benign: 27262 (1.90%)
- DDoS: 16149 (6.54%)
- DoS: 1 (0.00%)
- Infiltration: 0 (0.00%)
- Botnet: 1610 (5.49%)
- Bruteforce: 1327 (6.46%)
- Portscan: 36 (8.37%)
- Webattack: 122 (19.52%)

Feature: Bwd Packet Length Mean
- Benign: 121408 (8.45%)
- DDoS: 16161 (6.54%)
- DoS: 0 (0.00%)
- Infiltration: 1013 (5.37%)
- Botnet: 1764 (6.01%)
- Bruteforce: 1908 (9.29%)
- Portscan: 34 (7.91%)
- Webattack: 122 (19.52%)

Feature: Bwd Packet Length Std
- Benign: 40833 (2.84%)
- DDoS: 16141 (6.54%)
- DoS: 402 (0.51%)
- Infiltration: 74 (0.39%)
- Botnet: 1764 (6.01%)
- Bruteforce: 1911 (9.30%)
- Portscan: 36 (8.37%)
- Webattack: 122 (19.52%)

Feature: Flow Bytes/s
- Benign: 319778 (22.25%)
- DDoS: 44053 (17.84%)
- DoS: 10132 (12.80%)
- Infiltration: 4291 (22.74%)
- Botnet: 2845 (9.69%)
- Bruteforce: 3082 (15.00%)
- Portscan: 77 (17.91%)
- Webattack: 149 (23.84%)

Feature: Flow Packets/s
- Benign: 301763 (20.99%)
- DDoS: 43821 (17.74%)
- DoS: 9453 (11.94%)
- Infiltration: 3039 (16.10%)
- Botnet: 2813 (9.58%)
- Bruteforce: 2635 (12.82%)
- Portscan: 73 (16.98%)
- Webattack: 104 (16.64%)

Feature: Flow IAT Mean
- Benign: 258296 (17.97%)
- DDoS: 44064 (17.84%)
- DoS: 2243 (2.83%)
- Infiltration: 3717 (19.70%)
- Botnet: 3104 (10.58%)
- Bruteforce: 2283 (11.11%)
- Portscan: 94 (21.86%)
- Webattack: 28 (4.48%)

Feature: Flow IAT Std
- Benign: 186774 (12.99%)
- DDoS: 40394 (16.36%)
- DoS: 402 (0.51%)
- Infiltration: 3332 (17.66%)
- Botnet: 3246 (11.06%)
- Bruteforce: 2107 (10.26%)
- Portscan: 91 (21.16%)
- Webattack: 27 (4.32%)

Feature: Flow IAT Max
- Benign: 166932 (11.61%)
- DDoS: 35723 (14.46%)
- DoS: 0 (0.00%)
- Infiltration: 1925 (10.20%)
- Botnet: 3199 (10.90%)
- Bruteforce: 2089 (10.17%)
- Portscan: 93 (21.63%)
- Webattack: 184 (29.44%)

Feature: Flow IAT Min
- Benign: 308112 (21.43%)
- DDoS: 58129 (23.54%)
- DoS: 8191 (10.34%)
- Infiltration: 3260 (17.28%)
- Botnet: 444 (1.51%)
- Bruteforce: 658 (3.20%)
- Portscan: 27 (6.28%)
- Webattack: 44 (7.04%)

Feature: Fwd IAT Total
- Benign: 260387 (18.11%)
- DDoS: 39677 (16.06%)
- DoS: 0 (0.00%)
- Infiltration: 3643 (19.31%)
- Botnet: 2040 (6.95%)
- Bruteforce: 2630 (12.80%)
- Portscan: 92 (21.40%)
- Webattack: 211 (33.76%)

Feature: Fwd IAT Mean
- Benign: 252056 (17.53%)
- DDoS: 41957 (16.99%)
- DoS: 1833 (2.31%)
- Infiltration: 3998 (21.19%)
- Botnet: 2218 (7.56%)
- Bruteforce: 2114 (10.29%)
- Portscan: 87 (20.23%)
- Webattack: 28 (4.48%)

Feature: Fwd IAT Std
- Benign: 310032 (21.57%)
- DDoS: 49536 (20.06%)
- DoS: 0 (0.00%)
- Infiltration: 4127 (21.87%)
- Botnet: 2258 (7.69%)
- Bruteforce: 2048 (9.97%)
- Portscan: 42 (9.77%)
- Webattack: 27 (4.32%)

Feature: Fwd IAT Max
- Benign: 159866 (11.12%)
- DDoS: 36899 (14.94%)
- DoS: 0 (0.00%)
- Infiltration: 3437 (18.21%)
- Botnet: 2002 (6.82%)
- Bruteforce: 2124 (10.34%)
- Portscan: 92 (21.40%)
- Webattack: 28 (4.48%)

Feature: Fwd IAT Min
- Benign: 324450 (22.57%)
- DDoS: 57863 (23.43%)
- DoS: 8128 (10.26%)
- Infiltration: 4453 (23.60%)
- Botnet: 5116 (17.43%)
- Bruteforce: 947 (4.61%)
- Portscan: 52 (12.09%)
- Webattack: 37 (5.92%)

Feature: Bwd IAT Total
- Benign: 251328 (17.48%)
- DDoS: 4797 (1.94%)
- DoS: 14913 (18.83%)
- Infiltration: 3845 (20.38%)
- Botnet: 3853 (13.13%)
- Bruteforce: 2609 (12.70%)
- Portscan: 106 (24.65%)
- Webattack: 122 (19.52%)

Feature: Bwd IAT Mean
- Benign: 226066 (15.73%)
- DDoS: 4753 (1.92%)
- DoS: 15771 (19.92%)
- Infiltration: 3809 (20.19%)
- Botnet: 3866 (13.17%)
- Bruteforce: 2034 (9.90%)
- Portscan: 106 (24.65%)
- Webattack: 122 (19.52%)

Feature: Bwd IAT Std
- Benign: 223055 (15.52%)
- DDoS: 4616 (1.87%)
- DoS: 19481 (24.60%)
- Infiltration: 3904 (20.69%)
- Botnet: 3885 (13.24%)
- Bruteforce: 2211 (10.76%)
- Portscan: 28 (6.51%)
- Webattack: 122 (19.52%)

Feature: Bwd IAT Max
- Benign: 236348 (16.44%)
- DDoS: 4705 (1.90%)
- DoS: 12206 (15.41%)
- Infiltration: 4209 (22.31%)
- Botnet: 3844 (13.10%)
- Bruteforce: 2125 (10.34%)
- Portscan: 106 (24.65%)
- Webattack: 122 (19.52%)

Feature: Bwd IAT Min
- Benign: 336707 (23.42%)
- DDoS: 7815 (3.16%)
- DoS: 12807 (16.17%)
- Infiltration: 3848 (20.39%)
- Botnet: 836 (2.85%)
- Bruteforce: 1409 (6.86%)
- Portscan: 106 (24.65%)
- Webattack: 122 (19.52%)

Feature: Fwd PSH Flags
- Benign: 54784 (3.81%)
- DDoS: 6 (0.00%)
- DoS: 944 (1.19%)
- Infiltration: 1598 (8.47%)
- Botnet: 1 (0.00%)
- Bruteforce: 421 (2.05%)
- Portscan: 0 (0.00%)
- Webattack: 0 (0.00%)

Feature: Fwd Header Length
- Benign: 83369 (5.80%)
- DDoS: 28846 (11.68%)
- DoS: 1957 (2.47%)
- Infiltration: 808 (4.28%)
- Botnet: 1535 (5.23%)
- Bruteforce: 2602 (12.66%)
- Portscan: 2 (0.47%)
- Webattack: 237 (37.92%)

Feature: Bwd Header Length
- Benign: 115152 (8.01%)
- DDoS: 198 (0.08%)
- DoS: 84 (0.11%)
- Infiltration: 1574 (8.34%)
- Botnet: 1540 (5.25%)
- Bruteforce: 1912 (9.31%)
- Portscan: 39 (9.07%)
- Webattack: 237 (37.92%)

Feature: Fwd Packets/s
- Benign: 298236 (20.75%)
- DDoS: 43502 (17.61%)
- DoS: 7368 (9.30%)
- Infiltration: 2986 (15.82%)
- Botnet: 3398 (11.58%)
- Bruteforce: 2352 (11.45%)
- Portscan: 77 (17.91%)
- Webattack: 104 (16.64%)

Feature: Bwd Packets/s
- Benign: 313900 (21.84%)
- DDoS: 40283 (16.31%)
- DoS: 16836 (21.26%)
- Infiltration: 3723 (19.73%)
- Botnet: 3438 (11.71%)
- Bruteforce: 2355 (11.46%)
- Portscan: 68 (15.81%)
- Webattack: 237 (37.92%)

Feature: Packet Length Max
- Benign: 28885 (2.01%)
- DDoS: 16149 (6.54%)
- DoS: 1 (0.00%)
- Infiltration: 0 (0.00%)
- Botnet: 1150 (3.92%)
- Bruteforce: 1328 (6.46%)
- Portscan: 192 (44.65%)
- Webattack: 149 (23.84%)

Feature: Packet Length Mean
- Benign: 85914 (5.98%)
- DDoS: 13035 (5.28%)
- DoS: 0 (0.00%)
- Infiltration: 580 (3.07%)
- Botnet: 1377 (4.69%)
- Bruteforce: 3835 (18.67%)
- Portscan: 201 (46.74%)
- Webattack: 149 (23.84%)

Feature: Packet Length Std
- Benign: 20542 (1.43%)
- DDoS: 16149 (6.54%)
- DoS: 28 (0.04%)
- Infiltration: 17 (0.09%)
- Botnet: 1377 (4.69%)
- Bruteforce: 3530 (17.18%)
- Portscan: 36 (8.37%)
- Webattack: 122 (19.52%)

Feature: Packet Length Variance
- Benign: 111343 (7.75%)
- DDoS: 16161 (6.54%)
- DoS: 2390 (3.02%)
- Infiltration: 1414 (7.49%)
- Botnet: 1377 (4.69%)
- Bruteforce: 3530 (17.18%)
- Portscan: 40 (9.30%)
- Webattack: 122 (19.52%)

Feature: SYN Flag Count
- Benign: 69042 (4.80%)
- DDoS: 1256 (0.51%)
- DoS: 2133 (2.69%)
- Infiltration: 1598 (8.47%)
- Botnet: 1 (0.00%)
- Bruteforce: 421 (2.05%)
- Portscan: 0 (0.00%)
- Webattack: 0 (0.00%)

Feature: URG Flag Count
- Benign: 63194 (4.40%)
- DDoS: 96 (0.04%)
- DoS: 2260 (2.85%)
- Infiltration: 497 (2.63%)
- Botnet: 41 (0.14%)
- Bruteforce: 96 (0.47%)
- Portscan: 9 (2.09%)
- Webattack: 25 (4.00%)

Feature: Avg Packet Size
- Benign: 78454 (5.46%)
- DDoS: 13314 (5.39%)
- DoS: 0 (0.00%)
- Infiltration: 648 (3.43%)
- Botnet: 1377 (4.69%)
- Bruteforce: 3835 (18.67%)
- Portscan: 73 (16.98%)
- Webattack: 149 (23.84%)

Feature: Avg Fwd Segment Size
- Benign: 28829 (2.01%)
- DDoS: 53668 (21.73%)
- DoS: 1100 (1.39%)
- Infiltration: 707 (3.75%)
- Botnet: 1937 (6.60%)
- Bruteforce: 1822 (8.87%)
- Portscan: 65 (15.12%)
- Webattack: 149 (23.84%)

Feature: Avg Bwd Segment Size
- Benign: 121408 (8.45%)
- DDoS: 16161 (6.54%)
- DoS: 0 (0.00%)
- Infiltration: 1013 (5.37%)
- Botnet: 1764 (6.01%)
- Bruteforce: 1908 (9.29%)
- Portscan: 34 (7.91%)
- Webattack: 122 (19.52%)

Feature: Subflow Fwd Packets
- Benign: 82873 (5.77%)
- DDoS: 35630 (14.43%)
- DoS: 1542 (1.95%)
- Infiltration: 1110 (5.88%)
- Botnet: 1419 (4.84%)
- Bruteforce: 2602 (12.66%)
- Portscan: 34 (7.91%)
- Webattack: 228 (36.48%)

Feature: Subflow Fwd Bytes
- Benign: 39934 (2.78%)
- DDoS: 53978 (21.86%)
- DoS: 3215 (4.06%)
- Infiltration: 504 (2.67%)
- Botnet: 1937 (6.60%)
- Bruteforce: 2495 (12.14%)
- Portscan: 70 (16.28%)
- Webattack: 149 (23.84%)

Feature: Subflow Bwd Packets
- Benign: 87244 (6.07%)
- DDoS: 199 (0.08%)
- DoS: 85 (0.11%)
- Infiltration: 1194 (6.33%)
- Botnet: 1540 (5.25%)
- Bruteforce: 1912 (9.31%)
- Portscan: 163 (37.91%)
- Webattack: 211 (33.76%)

Feature: Subflow Bwd Bytes
- Benign: 177607 (12.36%)
- DDoS: 16430 (6.65%)
- DoS: 24 (0.03%)
- Infiltration: 2138 (11.33%)
- Botnet: 1610 (5.49%)
- Bruteforce: 1913 (9.31%)
- Portscan: 36 (8.37%)
- Webattack: 122 (19.52%)

Feature: Init Fwd Win Bytes
- Benign: 493994 (34.37%)
- DDoS: 39933 (16.17%)
- DoS: 29554 (37.32%)
- Infiltration: 4293 (22.75%)
- Botnet: 279 (0.95%)
- Bruteforce: 1902 (9.26%)
- Portscan: 0 (0.00%)
- Webattack: 0 (0.00%)

Feature: Init Bwd Win Bytes
- Benign: 349532 (24.32%)
- DDoS: 9377 (3.80%)
- DoS: 14328 (18.09%)
- Infiltration: 4318 (22.88%)
- Botnet: 1557 (5.31%)
- Bruteforce: 1873 (9.12%)
- Portscan: 0 (0.00%)
- Webattack: 0 (0.00%)

Feature: Fwd Act Data Packets
- Benign: 76932 (5.35%)
- DDoS: 27532 (11.15%)
- DoS: 9955 (12.57%)
- Infiltration: 772 (4.09%)
- Botnet: 1535 (5.23%)
- Bruteforce: 1345 (6.55%)
- Portscan: 9 (2.09%)
- Webattack: 149 (23.84%)

Feature: Fwd Seg Size Min
- Benign: 562595 (39.14%)
- DDoS: 25318 (10.25%)
- DoS: 20310 (25.65%)
- Infiltration: 447 (2.37%)
- Botnet: 173 (0.59%)
- Bruteforce: 11 (0.05%)
- Portscan: 5 (1.16%)
- Webattack: 1 (0.16%)

Feature: Active Mean
- Benign: 207682 (14.45%)
- DDoS: 19286 (7.81%)
- DoS: 14493 (18.30%)
- Infiltration: 4527 (23.99%)
- Botnet: 50 (0.17%)
- Bruteforce: 0 (0.00%)
- Portscan: 61 (14.19%)
- Webattack: 27 (4.32%)

Feature: Active Std
- Benign: 138991 (9.67%)
- DDoS: 6682 (2.71%)
- DoS: 1859 (2.35%)
- Infiltration: 2659 (14.09%)
- Botnet: 48 (0.16%)
- Bruteforce: 0 (0.00%)
- Portscan: 3 (0.70%)
- Webattack: 0 (0.00%)

Feature: Active Max
- Benign: 207682 (14.45%)
- DDoS: 19286 (7.81%)
- DoS: 14510 (18.32%)
- Infiltration: 4527 (23.99%)
- Botnet: 50 (0.17%)
- Bruteforce: 0 (0.00%)
- Portscan: 61 (14.19%)
- Webattack: 27 (4.32%)

Feature: Active Min
- Benign: 207682 (14.45%)
- DDoS: 19286 (7.81%)
- DoS: 13659 (17.25%)
- Infiltration: 4527 (23.99%)
- Botnet: 50 (0.17%)
- Bruteforce: 0 (0.00%)
- Portscan: 61 (14.19%)
- Webattack: 27 (4.32%)

Feature: Idle Mean
- Benign: 272236 (18.94%)
- DDoS: 33705 (13.65%)
- DoS: 0 (0.00%)
- Infiltration: 1829 (9.69%)
- Botnet: 50 (0.17%)
- Bruteforce: 0 (0.00%)
- Portscan: 61 (14.19%)
- Webattack: 28 (4.48%)

Feature: Idle Std
- Benign: 156188 (10.87%)
- DDoS: 10516 (4.26%)
- DoS: 3548 (4.48%)
- Infiltration: 3260 (17.28%)
- Botnet: 48 (0.16%)
- Bruteforce: 0 (0.00%)
- Portscan: 3 (0.70%)
- Webattack: 28 (4.48%)

Feature: Idle Max
- Benign: 272236 (18.94%)
- DDoS: 35712 (14.46%)
- DoS: 0 (0.00%)
- Infiltration: 1921 (10.18%)
- Botnet: 50 (0.17%)
- Bruteforce: 0 (0.00%)
- Portscan: 61 (14.19%)
- Webattack: 28 (4.48%)

Feature: Idle Min
- Benign: 272236 (18.94%)
- DDoS: 35579 (14.41%)
- DoS: 0 (0.00%)
- Infiltration: 1680 (8.90%)
- Botnet: 50 (0.17%)
- Bruteforce: 0 (0.00%)
- Portscan: 61 (14.19%)
- Webattack: 28 (4.48%)

Based on observations from sampled dataset, out of 57 independent features, 12 features have outliers whose percentage of difference betweem Malicious and Benign events is greater than or equal to 10%.

  1. Fwd Packets Length Total: Malicious > Benign: Difference = 17%
  2. Bwd Packet Length Max: Malicious > Benign: Difference = 10%
  3. Bwd Packet Length Std: Malicious > Benign: Difference = 10%
  4. Bwd IAT Mean: Malicious < Benign: Difference = 15%
  5. Fwd Header Length: Malicious > Benign: Difference = 10%
  6. Packet Length Max: Malicious > Benign: Diffence = 10%
  7. Packet Length Std: Malicious > Benign: Difference = 11%
  8. Avg Fwd Segment Size: Malicious > Benign: Difference = 11%
  9. Subflow Fwd Bytes: Malicious > Benign: Difference = 18%
  10. Init Fwd Win Bytes: Malicious < Benign: Difference = 15%
  11. Init Bwd Win Bytes: Malicious < Benign: Difference = 18%
  12. Fwd Seg Size Min: Malicious < Benign: Difference = 10%

Remaining 45 features have very similar percentage (nearly equal percentage) of outliers labelled as Benign and Malicious.

Out of the 12 features listed above, we have following 4 features with higher outliers percentage: -

  1. Init Fwd Win Bytes: 39.24% records are outliers
  2. Init Bwd Win Bytes: 37.32% records are outliers
  3. Fwd Seg Size Min: 37.02% records are outliers
  4. Bwd IAT Mean: 14.04% records are outliers

Among the above 4 features, Benign records are more than Malicious. Thus, the features with relatively higher number of outliers do not indicate any anomaly or provide differentiation to detect Malicious events.

Remaining 8 features have less than or equal to 6% of records as outliers.

  1. Fwd Packets Length Total
  2. Bwd Packet Length Max
  3. Bwd Packet Length Std
  4. Fwd Header Length
  5. Packet Length Max
  6. Packet Length Std
  7. Avg Fwd Segment Size
  8. Subflow Fwd Bytes

All the above 8 features have Malicious events are more than Benign events. Thus, the features with relatively lesser number of outliers help to provide small differentiation of Malicious events over Benign events. As the result, they can also be useful as anomalies in dataset indicating towards Malicious events.

Thus, features with higher number of outliers have more number of benign events than malicious events. And features with lesser number of outliers have more number of malicious events than benign events. And all 12 features have outliers with difference betweeen the two categories greater than or equal to 10%.

To handle outliers, we shall try to use 2 methods: -

  1. Winsorization
  2. Robust Scaling

We shall perform winsorization on following 4 features: -

  1. Init Fwd Win Bytes
  2. Init Bwd Win Bytes
  3. Fwd Seg Size Min
  4. Bwd IAT Mean

Init Fwd Win Bytes

In [45]:
plt.hist(sampled_cic_df["Init Fwd Win Bytes"], edgecolor="black", log=True)
plt.xlabel("Init Fwd Win Bytes")
plt.ylabel("Frequency")
plt.title("Sampled data, log transformed histogram: Init Fwd Win Bytes")
plt.show()
No description has been provided for this image
In [46]:
upper_limit=sampled_cic_df["Init Fwd Win Bytes"].quantile(0.95)
lower_limit=sampled_cic_df["Init Fwd Win Bytes"].quantile(0.05)
print("Upper limit: ",upper_limit)
print("Lower limit: ",lower_limit)
print("Maximum value: ",sampled_cic_df["Init Fwd Win Bytes"].max())
print("Minimum value: ",sampled_cic_df["Init Fwd Win Bytes"].min())
print("Median value: ",sampled_cic_df["Init Fwd Win Bytes"].median())
print("Standard deviation value: ",sampled_cic_df["Init Fwd Win Bytes"].std())
Upper limit:  65535.0
Lower limit:  219.0
Maximum value:  65535.0
Minimum value:  0.0
Median value:  8192.0
Standard deviation value:  17920.85671285462
In [47]:
new_df=pd.DataFrame()
new_df["Init Fwd Win Bytes"]=np.where(sampled_cic_df["Init Fwd Win Bytes"]>=upper_limit,
                upper_limit,
               np.where(sampled_cic_df["Init Fwd Win Bytes"]<=lower_limit,
               lower_limit,
               sampled_cic_df["Init Fwd Win Bytes"]))
In [48]:
print("Maximum value: ",new_df["Init Fwd Win Bytes"].max())
print("Minimum value: ",new_df["Init Fwd Win Bytes"].min())
print("Median value: ",new_df["Init Fwd Win Bytes"].median())
print("Standard deviation value: ",new_df["Init Fwd Win Bytes"].std())
print("Upper limit: ",new_df["Init Fwd Win Bytes"].quantile(0.95))
print("Lower limit: ",new_df["Init Fwd Win Bytes"].quantile(0.05))
Maximum value:  65535.0
Minimum value:  219.0
Median value:  8192.0
Standard deviation value:  17917.485550925405
Upper limit:  65535.0
Lower limit:  219.0
In [49]:
plt.hist(new_df["Init Fwd Win Bytes"], edgecolor="black", log=True)
plt.xlabel("Init Fwd Win Bytes")
plt.ylabel("Frequency")
plt.title("Sampled data, log transformed histogram after winsorization: Init Fwd Win Bytes")
plt.show()
No description has been provided for this image
In [50]:
#Plotting both the histograms in parallel for easier comparison
fig1=plt.figure()
ax1=plt.subplot(2,1,1)
ax1.hist(sampled_cic_df["Init Fwd Win Bytes"], edgecolor="black", log=True)
ax1.set_xlabel("Init Fwd Win Bytes")
ax1.set_ylabel("Frequency")
ax1.set_title("Prior winsorization")
ax2=plt.subplot(2,1,2)
ax2.hist(new_df["Init Fwd Win Bytes"], edgecolor="black", log=True)
ax2.set_xlabel("Init Fwd Win Bytes")
ax2.set_ylabel("Frequency")
ax2.set_title("After winsorization")
plt.tight_layout()
plt.show()
No description has been provided for this image
  1. We observed after performing winsorization on sampled data for Init Fwd Win Bytes, the distribution in above histograms looks very similar to original sampled dataset's distribution.

  2. Median value has remained constant=8192.0

  3. Standard deviation value has slightly reduced from 17920 to 17917.

Init Bwd Win Bytes

In [51]:
plt.hist(sampled_cic_df["Init Bwd Win Bytes"], edgecolor="black", log=True)
plt.xlabel("Init Bwd Win Bytes")
plt.ylabel("Frequency")
plt.title("Sampled data, log transformed histogram: Init Bwd Win Bytes")
plt.show()
No description has been provided for this image
In [52]:
upper_limit=sampled_cic_df["Init Bwd Win Bytes"].quantile(0.95)
lower_limit=sampled_cic_df["Init Bwd Win Bytes"].quantile(0.05)
print("Upper limit: ",upper_limit)
print("Lower limit: ",lower_limit)
print("Maximum value: ",sampled_cic_df["Init Bwd Win Bytes"].max())
print("Minimum value: ",sampled_cic_df["Init Bwd Win Bytes"].min())
print("Median value: ",sampled_cic_df["Init Bwd Win Bytes"].median())
print("Standard deviation value: ",sampled_cic_df["Init Bwd Win Bytes"].std())
Upper limit:  62856.0
Lower limit:  31.0
Maximum value:  65535.0
Minimum value:  0.0
Median value:  235.0
Standard deviation value:  19414.847112640815
In [53]:
new_df["Init Bwd Win Bytes"]=np.where(sampled_cic_df["Init Bwd Win Bytes"]>=upper_limit,
                upper_limit,
               np.where(sampled_cic_df["Init Bwd Win Bytes"]<=lower_limit,
               lower_limit,
               sampled_cic_df["Init Bwd Win Bytes"]))
In [54]:
print("Maximum value: ",new_df["Init Bwd Win Bytes"].max())
print("Minimum value: ",new_df["Init Bwd Win Bytes"].min())
print("Median value: ",new_df["Init Bwd Win Bytes"].median())
print("Standard deviation value: ",new_df["Init Bwd Win Bytes"].std())
print("Upper limit: ",new_df["Init Bwd Win Bytes"].quantile(0.95))
print("Lower limit: ",new_df["Init Bwd Win Bytes"].quantile(0.05))
Maximum value:  62856.0
Minimum value:  31.0
Median value:  235.0
Standard deviation value:  19358.862080156487
Upper limit:  62856.0
Lower limit:  31.0
In [55]:
plt.hist(new_df["Init Bwd Win Bytes"], edgecolor="black", log=True)
plt.xlabel("Init Bwd Win Bytes")
plt.ylabel("Frequency")
plt.title("Sampled data, log transformed histogram after winsorization: Init Bwd Win Bytes")
plt.show()
No description has been provided for this image
In [56]:
#Plotting both the histograms in parallel for easier comparison
fig1=plt.figure()
ax1=plt.subplot(2,1,1)
ax1.hist(sampled_cic_df["Init Bwd Win Bytes"], edgecolor="black", log=True)
ax1.set_xlabel("Init Bwd Win Bytes")
ax1.set_ylabel("Frequency")
ax1.set_title("Prior winsorization")
ax2=plt.subplot(2,1,2)
ax2.hist(new_df["Init Bwd Win Bytes"], edgecolor="black", log=True)
ax2.set_xlabel("Init Bwd Win Bytes")
ax2.set_ylabel("Frequency")
ax2.set_title("After winsorization")
plt.tight_layout()
plt.show()
No description has been provided for this image
  1. We observed after performing winsorization on sampled data for Init Bwd Win Bytes, the distribution in above histograms looks very similar to original sampled dataset's distribution.

  2. Median value has remained constant=235.0

  3. Standard deviation value has slightly reduced from 19414 to 19358.

Fwd Seg Size Min

In [57]:
plt.hist(sampled_cic_df["Fwd Seg Size Min"], edgecolor="black", log=True)
plt.xlabel("Fwd Seg Size Min")
plt.ylabel("Frequency")
plt.title("Sampled data, log transformed histogram: Fwd Seg Size Min")
plt.show()
No description has been provided for this image
In [58]:
upper_limit=sampled_cic_df["Fwd Seg Size Min"].quantile(0.95)
lower_limit=sampled_cic_df["Fwd Seg Size Min"].quantile(0.05)
print("Upper limit: ",upper_limit)
print("Lower limit: ",lower_limit)
print("Maximum value: ",sampled_cic_df["Fwd Seg Size Min"].max())
print("Minimum value: ",sampled_cic_df["Fwd Seg Size Min"].min())
print("Median value: ",sampled_cic_df["Fwd Seg Size Min"].median())
print("Standard deviation value: ",sampled_cic_df["Fwd Seg Size Min"].std())
Upper limit:  32.0
Lower limit:  8.0
Maximum value:  1480.0
Minimum value:  0.0
Median value:  20.0
Standard deviation value:  25.971709364062733
In [59]:
new_df["Fwd Seg Size Min"]=np.where(sampled_cic_df["Fwd Seg Size Min"]>=upper_limit,
                upper_limit,
               np.where(sampled_cic_df["Fwd Seg Size Min"]<=lower_limit,
               lower_limit,
               sampled_cic_df["Fwd Seg Size Min"]))
In [60]:
print("Maximum value: ",new_df["Fwd Seg Size Min"].max())
print("Minimum value: ",new_df["Fwd Seg Size Min"].min())
print("Median value: ",new_df["Fwd Seg Size Min"].median())
print("Standard deviation value: ",new_df["Fwd Seg Size Min"].std())
print("Upper limit: ",new_df["Fwd Seg Size Min"].quantile(0.95))
print("Lower limit: ",new_df["Fwd Seg Size Min"].quantile(0.05))
Maximum value:  32.0
Minimum value:  8.0
Median value:  20.0
Standard deviation value:  7.262473158983896
Upper limit:  32.0
Lower limit:  8.0
In [61]:
plt.hist(new_df["Fwd Seg Size Min"], edgecolor="black", log=True)
plt.xlabel("Fwd Seg Size Min")
plt.ylabel("Frequency")
plt.title("Sampled data, log transformed histogram after winsorization: Fwd Seg Size Min")
plt.show()
No description has been provided for this image
In [62]:
#Plotting both the histograms in parallel for easier comparison
fig1=plt.figure()
ax1=plt.subplot(2,1,1)
ax1.hist(sampled_cic_df["Fwd Seg Size Min"], edgecolor="black", log=True)
ax1.set_xlabel("Fwd Seg Size Min")
ax1.set_ylabel("Frequency")
ax1.set_title("Prior winsorization")
ax2=plt.subplot(2,1,2)
ax2.hist(new_df["Fwd Seg Size Min"], edgecolor="black", log=True)
ax2.set_xlabel("Fwd Seg Size Min")
ax2.set_ylabel("Frequency")
ax2.set_title("After winsorization")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [63]:
sampled_cic_df[sampled_cic_df['Fwd Seg Size Min'] > 32]['Fwd Seg Size Min'].count()
Out[63]:
17305
  1. We observed after performing winsorization on sampled data for Fwd Seg Size Min, the distribution has changed drastically compared to earlier histogram of sampled dataset's distribution.

  2. Median value has remained constant=20.0

  3. Standard deviation value has reduced from 25.97 to 7.26

  4. Maximum value in the original sampled dataset is 1480 and maximum value after winsorization is 32. Number of values in original sampled dataset between 32 and 1480 = 17305.

Bwd IAT Mean

In [64]:
plt.hist(sampled_cic_df["Bwd IAT Mean"], edgecolor="black", log=True)
plt.xlabel("Bwd IAT Mean")
plt.ylabel("Frequency")
plt.title("Sampled data, log transformed histogram: Bwd IAT Mean")
plt.show()
No description has been provided for this image
In [65]:
upper_limit=sampled_cic_df["Bwd IAT Mean"].quantile(0.95)
lower_limit=sampled_cic_df["Bwd IAT Mean"].quantile(0.05)
print("Upper limit: ",upper_limit)
print("Lower limit: ",lower_limit)
print("Maximum value: ",sampled_cic_df["Bwd IAT Mean"].max())
print("Minimum value: ",sampled_cic_df["Bwd IAT Mean"].min())
print("Median value: ",sampled_cic_df["Bwd IAT Mean"].median())
print("Standard deviation value: ",sampled_cic_df["Bwd IAT Mean"].std())
Upper limit:  6501928.974999998
Lower limit:  0.0
Maximum value:  120000000.0
Minimum value:  0.0
Median value:  647.0
Standard deviation value:  6192044.5
In [66]:
new_df["Bwd IAT Mean"]=np.where(sampled_cic_df["Bwd IAT Mean"]>=upper_limit,
                upper_limit,
               np.where(sampled_cic_df["Bwd IAT Mean"]<=lower_limit,
               lower_limit,
               sampled_cic_df["Bwd IAT Mean"]))
In [67]:
print("Maximum value: ",new_df["Bwd IAT Mean"].max())
print("Minimum value: ",new_df["Bwd IAT Mean"].min())
print("Median value: ",new_df["Bwd IAT Mean"].median())
print("Standard deviation value: ",new_df["Bwd IAT Mean"].std())
print("Upper limit: ",new_df["Bwd IAT Mean"].quantile(0.95))
print("Lower limit: ",new_df["Bwd IAT Mean"].quantile(0.05))
Maximum value:  6501929.0
Minimum value:  0.0
Median value:  647.0
Standard deviation value:  1657361.2
Upper limit:  6501924.774999999
Lower limit:  0.0
In [68]:
plt.hist(new_df["Bwd IAT Mean"], edgecolor="black", log=True)
plt.xlabel("Bwd IAT Mean")
plt.ylabel("Frequency")
plt.title("Sampled data, log transformed histogram after winsorization: Bwd IAT Mean")
plt.show()
No description has been provided for this image
In [69]:
#Plotting both the histograms in parallel for easier comparison
fig1=plt.figure()
ax1=plt.subplot(2,1,1)
ax1.hist(sampled_cic_df["Bwd IAT Mean"], edgecolor="black", log=True)
ax1.set_xlabel("Bwd IAT Mean")
ax1.set_ylabel("Frequency")
ax1.set_title("Prior winsorization")
ax2=plt.subplot(2,1,2)
ax2.hist(new_df["Bwd IAT Mean"], edgecolor="black", log=True)
ax2.set_xlabel("Bwd IAT Mean")
ax2.set_ylabel("Frequency")
ax2.set_title("After winsorization")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [70]:
sampled_cic_df[sampled_cic_df['Bwd IAT Mean'] > 6501929]['Bwd IAT Mean'].count()
Out[70]:
91673
  1. We observed after performing winsorization on sampled data for Bwd IAT Mean, the distribution has changed compared to earlier histogram of sampled dataset's distribution. The main peak on first bin has remain unchanged. There is a new peak observed towards the right hand side of the chart, may be due to winsorization the outlier values have got added to the last bin and as the result its frequency increased.

  2. Median value has remained constant=647

  3. Standard deviation value has reduced from 6192044.5 to 1657361.2

  4. Maximum value in the original sampled dataset is 120000000 and maximum value after winsorization is 6501929. Number of values in original sampled dataset between 6501929 and 120000000 = 91673.

We will see use RobustScaler to transform the features and check the results to compare with the results of Winsorization and understand how both impact the data and handle outliers.

We shall perform Robust Scaling on following 4 features: -

  1. Init Fwd Win Bytes
  2. Init Bwd Win Bytes
  3. Fwd Seg Size Min
  4. Bwd IAT Mean
In [71]:
#Creating a copy of sampled dataset with the 4 features.
robust_scaler_test=sampled_cic_df[['Init Fwd Win Bytes','Init Bwd Win Bytes','Fwd Seg Size Min','Bwd IAT Mean']]
robust_scaler_test.head(10)
robust_scaler_test.reset_index()
Out[71]:
index Init Fwd Win Bytes Init Bwd Win Bytes Fwd Seg Size Min Bwd IAT Mean
0 5968290 219.0 211.0 32.0 27159.000000
1 8285216 63326.0 235.0 20.0 0.000000
2 8349977 8192.0 62856.0 20.0 297148.500000
3 7180832 8192.0 235.0 0.0 0.000000
4 2324438 8192.0 123.0 20.0 6344.799805
... ... ... ... ... ...
1833449 1606912 8192.0 31.0 20.0 15629.142578
1833450 7433839 8192.0 235.0 20.0 0.000000
1833451 2510144 8192.0 16625.0 0.0 0.000000
1833452 760618 279.0 235.0 20.0 0.000000
1833453 7134908 8192.0 235.0 20.0 0.000000

1833454 rows × 5 columns

In [72]:
from sklearn.preprocessing import RobustScaler
In [73]:
robust_scaler=RobustScaler()
robust_scaler.fit(robust_scaler_test)
robust_scaled_data=robust_scaler.transform(robust_scaler_test)
In [74]:
type(robust_scaled_data)
Out[74]:
numpy.ndarray
In [75]:
robust_scaled_data
Out[75]:
array([[-7.97300000e+03, -6.00000000e-01,  1.20000000e+01,
         1.00513250e-01],
       [ 5.51340000e+04,  0.00000000e+00,  0.00000000e+00,
        -2.45292973e-03],
       [ 0.00000000e+00,  1.56552500e+03,  0.00000000e+00,
         1.12410718e+00],
       ...,
       [ 0.00000000e+00,  4.09750000e+02, -2.00000000e+01,
        -2.45292973e-03],
       [-7.91300000e+03,  0.00000000e+00,  0.00000000e+00,
        -2.45292973e-03],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -2.45292973e-03]])
In [76]:
columns = robust_scaler_test.columns 
# Creating a DataFrame from the scaled data 
robust_scaled_df=pd.DataFrame(data=robust_scaled_data, columns=columns)
In [77]:
robust_scaled_df.head()
Out[77]:
Init Fwd Win Bytes Init Bwd Win Bytes Fwd Seg Size Min Bwd IAT Mean
0 -7973.0 -0.600 12.0 0.100513
1 55134.0 0.000 0.0 -0.002453
2 0.0 1565.525 0.0 1.124107
3 0.0 0.000 -20.0 -0.002453
4 0.0 -2.800 0.0 0.021602
In [78]:
robust_scaled_df.tail()
Out[78]:
Init Fwd Win Bytes Init Bwd Win Bytes Fwd Seg Size Min Bwd IAT Mean
1833449 0.0 -5.10 0.0 0.056801
1833450 0.0 0.00 0.0 -0.002453
1833451 0.0 409.75 -20.0 -0.002453
1833452 -7913.0 0.00 0.0 -0.002453
1833453 0.0 0.00 0.0 -0.002453
In [79]:
fig1=plt.figure()
ax1=plt.subplot(2,1,1)
ax1.hist(sampled_cic_df["Init Fwd Win Bytes"], edgecolor="black", log=True)
ax1.set_xlabel("Init Fwd Win Bytes")
ax1.set_ylabel("Frequency")
ax1.set_title("Prior scaling")
ax2=plt.subplot(2,1,2)
ax2.hist(robust_scaled_df["Init Fwd Win Bytes"], edgecolor="black", log=True)
ax2.set_xlabel("Init Fwd Win Bytes")
ax2.set_ylabel("Frequency")
ax2.set_title("After scaling")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [80]:
print("Maximum value after scaling: ",robust_scaled_df["Init Fwd Win Bytes"].max())
print("Minimum value after scaling: ",robust_scaled_df["Init Fwd Win Bytes"].min())
print("Median value after scaling: ",robust_scaled_df["Init Fwd Win Bytes"].median())
print("Standard deviation value after scaling: ",robust_scaled_df["Init Fwd Win Bytes"].std())
Maximum value after scaling:  57343.0
Minimum value after scaling:  -8192.0
Median value after scaling:  0.0
Standard deviation value after scaling:  17920.85671285462
In [81]:
fig1=plt.figure()
ax1=plt.subplot(2,1,1)
ax1.hist(sampled_cic_df["Init Bwd Win Bytes"], edgecolor="black", log=True)
ax1.set_xlabel("Init Bwd Win Bytes")
ax1.set_ylabel("Frequency")
ax1.set_title("Prior scaling")
ax2=plt.subplot(2,1,2)
ax2.hist(robust_scaled_df["Init Bwd Win Bytes"], edgecolor="black", log=True)
ax2.set_xlabel("Init Bwd Win Bytes")
ax2.set_ylabel("Frequency")
ax2.set_title("After scaling")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [82]:
print("Maximum value after scaling: ",robust_scaled_df["Init Bwd Win Bytes"].max())
print("Minimum value after scaling: ",robust_scaled_df["Init Bwd Win Bytes"].min())
print("Median value after scaling: ",robust_scaled_df["Init Bwd Win Bytes"].median())
print("Standard deviation value after scaling: ",robust_scaled_df["Init Bwd Win Bytes"].std())
Maximum value after scaling:  1632.5
Minimum value after scaling:  -5.875
Median value after scaling:  0.0
Standard deviation value after scaling:  485.3711778160208
In [83]:
fig1=plt.figure()
ax1=plt.subplot(2,1,1)
ax1.hist(sampled_cic_df["Fwd Seg Size Min"], edgecolor="black", log=True)
ax1.set_xlabel("Fwd Seg Size Min")
ax1.set_ylabel("Frequency")
ax1.set_title("Prior scaling")
ax2=plt.subplot(2,1,2)
ax2.hist(robust_scaled_df["Fwd Seg Size Min"], edgecolor="black", log=True)
ax2.set_xlabel("Fwd Seg Size Min")
ax2.set_ylabel("Frequency")
ax2.set_title("After scaling")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [84]:
print("Maximum value after scaling: ",robust_scaled_df["Fwd Seg Size Min"].max())
print("Minimum value after scaling: ",robust_scaled_df["Fwd Seg Size Min"].min())
print("Median value after scaling: ",robust_scaled_df["Fwd Seg Size Min"].median())
print("Standard deviation value after scaling: ",robust_scaled_df["Fwd Seg Size Min"].std())
Maximum value after scaling:  1460.0
Minimum value after scaling:  -20.0
Median value after scaling:  0.0
Standard deviation value after scaling:  25.971709364062733
In [85]:
fig1=plt.figure()
ax1=plt.subplot(2,1,1)
ax1.hist(sampled_cic_df["Bwd IAT Mean"], edgecolor="black", log=True)
ax1.set_xlabel("Bwd IAT Mean")
ax1.set_ylabel("Frequency")
ax1.set_title("Prior scaling")
ax2=plt.subplot(2,1,2)
ax2.hist(robust_scaled_df["Bwd IAT Mean"], edgecolor="black", log=True)
ax2.set_xlabel("Bwd IAT Mean")
ax2.set_ylabel("Frequency")
ax2.set_title("After scaling")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [86]:
print("Maximum value after scaling: ",robust_scaled_df["Bwd IAT Mean"].max())
print("Minimum value after scaling: ",robust_scaled_df["Bwd IAT Mean"].min())
print("Median value after scaling: ",robust_scaled_df["Bwd IAT Mean"].median())
print("Standard deviation value after scaling: ",robust_scaled_df["Bwd IAT Mean"].std())
Maximum value after scaling:  454.94587429990975
Minimum value after scaling:  -0.002452929730979813
Median value after scaling:  0.0
Standard deviation value after scaling:  23.4755026768653

In RobustScaler, we observed the model transforms data such that it generates negative values. It may be due to skewed nature of the features.

However, in our model, we cannot use negative values.

Thus, we shall not employ RobustScaler to scale data for handling outliers in features.

In [87]:
#Performing imputation on all features except for the above 4 features.
features_to_impute=sampled_cic_df.columns.tolist()
print(features_to_impute)
features_to_impute.remove('Init Fwd Win Bytes')
features_to_impute.remove('Init Bwd Win Bytes')
features_to_impute.remove('Fwd Seg Size Min')
features_to_impute.remove('Bwd IAT Mean')
features_to_impute.remove('ClassLabel')
features_to_impute.remove('isMalicious')
#print(features_to_impute)
['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Fwd Packets Length Total', 'Bwd Packets Length Total', 'Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Max', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'SYN Flag Count', 'URG Flag Count', 'Avg Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Init Fwd Win Bytes', 'Init Bwd Win Bytes', 'Fwd Act Data Packets', 'Fwd Seg Size Min', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min', 'ClassLabel', 'isMalicious']
In [88]:
imputed_sample_df=sampled_cic_df.copy()
In [89]:
imputed_sample_df.head()
Out[89]:
Flow Duration Total Fwd Packets Total Backward Packets Fwd Packets Length Total Bwd Packets Length Total Fwd Packet Length Max Fwd Packet Length Mean Fwd Packet Length Std Bwd Packet Length Max Bwd Packet Length Mean ... Active Mean Active Std Active Max Active Min Idle Mean Idle Std Idle Max Idle Min ClassLabel isMalicious
5968290 3813760.0 5 3 935.0 397.0 935.0 187.000 418.144714 397.0 132.333328 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign 0
8285216 88077383.0 2 0 0.0 0.0 0.0 0.000 0.000000 0.0 0.000000 ... 0.0 0.0 0.0 0.0 88077384.0 0.0 88077383.0 88077383.0 Benign 0
8349977 1914354.0 8 7 1144.0 1581.0 677.0 143.000 227.969925 1173.0 225.857147 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign 0
7180832 4002.0 6 0 2064.0 0.0 440.0 344.000 148.722565 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 DDoS 1
2324438 5368715.0 8 6 355.0 3292.0 198.0 44.375 75.864426 1460.0 548.666687 ... 102340.0 0.0 102340.0 102340.0 5266340.0 0.0 5266340.0 5266340.0 Benign 0

5 rows × 59 columns

In [90]:
for col in features_to_impute:
    Q1 = imputed_sample_df[col].quantile(0.25)
    Q3 = imputed_sample_df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Replacing outliers with the median
    median_value = imputed_sample_df[col].median()
    imputed_sample_df[col] = np.where((imputed_sample_df[col] < lower_bound) | (imputed_sample_df[col] > upper_bound), median_value, imputed_sample_df[col])
In [91]:
imputed_sample_df.head(10)
Out[91]:
Flow Duration Total Fwd Packets Total Backward Packets Fwd Packets Length Total Bwd Packets Length Total Fwd Packet Length Max Fwd Packet Length Mean Fwd Packet Length Std Bwd Packet Length Max Bwd Packet Length Mean ... Active Mean Active Std Active Max Active Min Idle Mean Idle Std Idle Max Idle Min ClassLabel isMalicious
5968290 3813760.0 5.0 3.0 935.0 397.0 935.0 187.000 418.144714 397.0 132.333328 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign 0
8285216 397660.5 2.0 0.0 0.0 0.0 0.0 0.000 0.000000 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign 0
8349977 1914354.0 8.0 7.0 1144.0 1581.0 677.0 143.000 227.969925 1173.0 225.857147 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign 0
7180832 4002.0 6.0 0.0 2064.0 0.0 440.0 44.000 148.722565 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 DDoS 1
2324438 5368715.0 8.0 6.0 355.0 232.0 198.0 44.375 75.864426 1460.0 108.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign 0
8791292 24810.0 1.0 1.0 35.0 51.0 35.0 35.000 0.000000 51.0 51.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign 0
2785928 84462.0 1.0 1.0 48.0 48.0 48.0 48.000 0.000000 48.0 48.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign 0
2360459 397660.5 3.0 0.0 0.0 0.0 0.0 0.000 0.000000 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign 0
6152798 45697.0 3.0 0.0 0.0 0.0 0.0 0.000 0.000000 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 DoS 1
1899583 80938.0 4.0 3.0 436.0 788.0 436.0 109.000 218.000000 788.0 262.666656 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Benign 0

10 rows × 59 columns

In [92]:
for col in features_to_impute:
    print("Feature name: ",col)
    print("Mean: ",imputed_sample_df[col].mean())
    print("Median: ",imputed_sample_df[col].median())
    print("Maximum: ",imputed_sample_df[col].max())
    print("Minimum: ",imputed_sample_df[col].min())
    print("Standard deviation: ",imputed_sample_df[col].std())
    print("\n")
Feature name:  Flow Duration
Mean:  1213665.5297534054
Median:  397660.25
Maximum:  13875506.0
Minimum:  1.0
Standard deviation:  2159655.192222251


Feature name:  Total Fwd Packets
Mean:  3.9682713610486
Median:  3.0
Maximum:  14.0
Minimum:  0.0
Standard deviation:  3.0508858389290645


Feature name:  Total Backward Packets
Mean:  2.7108163062722053
Median:  2.0
Maximum:  11.0
Minimum:  0.0
Standard deviation:  2.510281941244621


Feature name:  Fwd Packets Length Total
Mean:  395.65305265362537
Median:  97.0
Maximum:  2292.0
Minimum:  0.0
Standard deviation:  522.316758487006


Feature name:  Bwd Packets Length Total
Mean:  351.8448387578854
Median:  232.0
Maximum:  2410.0
Minimum:  0.0
Standard deviation:  484.0204739171949


Feature name:  Fwd Packet Length Max
Mean:  259.6825521665665
Median:  55.0
Maximum:  1235.0
Minimum:  0.0
Standard deviation:  319.60892275413545


Feature name:  Fwd Packet Length Mean
Mean:  59.869995
Median:  44.0
Maximum:  258.66666
Minimum:  0.0
Standard deviation:  58.978123


Feature name:  Fwd Packet Length Std
Mean:  92.038086
Median:  11.547006
Maximum:  451.74237
Minimum:  0.0
Standard deviation:  130.14272


Feature name:  Bwd Packet Length Max
Mean:  403.9789599302737
Median:  152.0
Maximum:  2410.0
Minimum:  0.0
Standard deviation:  508.52023457970176


Feature name:  Bwd Packet Length Mean
Mean:  108.74675
Median:  108.0
Maximum:  540.9375
Minimum:  0.0
Standard deviation:  103.59248


Feature name:  Bwd Packet Length Std
Mean:  163.04944
Median:  0.0
Maximum:  1013.63153
Minimum:  0.0
Standard deviation:  222.16183


Feature name:  Flow Bytes/s
Mean:  3909.036427263739
Median:  996.359843875
Maximum:  67824.6484698098
Minimum:  0.0
Standard deviation:  9988.548556992235


Feature name:  Flow Packets/s
Mean:  66.50042504069462
Median:  16.13210581515
Maximum:  1242.015625
Minimum:  0.016667174321018
Standard deviation:  172.02090506062973


Feature name:  Flow IAT Mean
Mean:  176059.79187172325
Median:  82687.748046875
Maximum:  1968061.25
Minimum:  0.3333333432674408
Standard deviation:  326493.8396342


Feature name:  Flow IAT Std
Mean:  210712.12
Median:  18652.17
Maximum:  2091517.5
Minimum:  0.0
Standard deviation:  469940.1


Feature name:  Flow IAT Max
Mean:  1481385.0418025213
Median:  225017.25
Maximum:  12737357.0
Minimum:  1.0
Standard deviation:  2794362.274549001


Feature name:  Flow IAT Min
Mean:  50.33824410102462
Median:  14.0
Maximum:  1170.0
Minimum:  0.0
Standard deviation:  149.01435145238835


Feature name:  Fwd IAT Total
Mean:  892652.4578448655
Median:  72909.25
Maximum:  11725868.0
Minimum:  0.0
Standard deviation:  1875030.9235689652


Feature name:  Fwd IAT Mean
Mean:  182948.53978239992
Median:  28688.02490234375
Maximum:  2684606.0
Minimum:  0.0
Standard deviation:  410226.2879792003


Feature name:  Fwd IAT Std
Mean:  46698.184
Median:  454.50018
Maximum:  999041.94
Minimum:  0.0
Standard deviation:  130210.86


Feature name:  Fwd IAT Max
Mean:  1212594.4175294281
Median:  61635.0
Maximum:  10556757.0
Minimum:  0.0
Standard deviation:  2622169.341081965


Feature name:  Fwd IAT Min
Mean:  65.34727787007473
Median:  36.0
Maximum:  1144.0
Minimum:  0.0
Standard deviation:  131.08000116496532


Feature name:  Bwd IAT Total
Mean:  203173.0466682011
Median:  732.0
Maximum:  3138539.0
Minimum:  0.0
Standard deviation:  534712.8396043193


Feature name:  Bwd IAT Std
Mean:  45497.36
Median:  0.0
Maximum:  705338.44
Minimum:  0.0
Standard deviation:  123579.555


Feature name:  Bwd IAT Max
Mean:  162097.3564518117
Median:  709.0
Maximum:  2382694.0
Minimum:  0.0
Standard deviation:  390905.02938092116


Feature name:  Bwd IAT Min
Mean:  36.23238488666746
Median:  3.0
Maximum:  767.0
Minimum:  0.0
Standard deviation:  113.35976821326203


Feature name:  Fwd PSH Flags
Mean:  0.0
Median:  0.0
Maximum:  0.0
Minimum:  0.0
Standard deviation:  0.0


Feature name:  Fwd Header Length
Mean:  94.90806205118864
Median:  72.0
Maximum:  360.0
Minimum:  0.0
Standard deviation:  79.50237485349817


Feature name:  Bwd Header Length
Mean:  68.0338039569032
Median:  60.0
Maximum:  328.0
Minimum:  0.0
Standard deviation:  67.83078348929288


Feature name:  Fwd Packets/s
Mean:  35.18084
Median:  8.57913
Maximum:  671.32117
Minimum:  0.0
Standard deviation:  90.893295


Feature name:  Bwd Packets/s
Mean:  11.960576
Median:  3.3275118
Maximum:  192.61328
Minimum:  0.0
Standard deviation:  25.856539


Feature name:  Packet Length Max
Mean:  488.07187526930045
Median:  232.0
Maximum:  2341.0
Minimum:  0.0
Standard deviation:  519.0516918125927


Feature name:  Packet Length Mean
Mean:  83.624954
Median:  78.666664
Maximum:  342.30768
Minimum:  0.0
Standard deviation:  68.9974


Feature name:  Packet Length Std
Mean:  155.90248
Median:  73.90083
Maximum:  785.4239
Minimum:  0.0
Standard deviation:  176.20068


Feature name:  Packet Length Variance
Mean:  36721.277
Median:  5461.3335
Maximum:  254975.14
Minimum:  0.0
Standard deviation:  54679.88


Feature name:  SYN Flag Count
Mean:  0.0
Median:  0.0
Maximum:  0.0
Minimum:  0.0
Standard deviation:  0.0


Feature name:  URG Flag Count
Mean:  0.0
Median:  0.0
Maximum:  0.0
Minimum:  0.0
Standard deviation:  0.0


Feature name:  Avg Packet Size
Mean:  97.50851
Median:  99.5
Maximum:  372.66666
Minimum:  0.0
Standard deviation:  75.065735


Feature name:  Avg Fwd Segment Size
Mean:  59.869995
Median:  44.0
Maximum:  258.66666
Minimum:  0.0
Standard deviation:  58.978123


Feature name:  Avg Bwd Segment Size
Mean:  108.74675
Median:  108.0
Maximum:  540.9375
Minimum:  0.0
Standard deviation:  103.59248


Feature name:  Subflow Fwd Packets
Mean:  3.9682713610486
Median:  3.0
Maximum:  14.0
Minimum:  0.0
Standard deviation:  3.0508858389290645


Feature name:  Subflow Fwd Bytes
Mean:  395.65305265362537
Median:  97.0
Maximum:  2292.0
Minimum:  0.0
Standard deviation:  522.316758487006


Feature name:  Subflow Bwd Packets
Mean:  2.7108163062722053
Median:  2.0
Maximum:  11.0
Minimum:  0.0
Standard deviation:  2.510281941244621


Feature name:  Subflow Bwd Bytes
Mean:  351.8448387578854
Median:  232.0
Maximum:  2410.0
Minimum:  0.0
Standard deviation:  484.0204739171949


Feature name:  Fwd Act Data Packets
Mean:  1.7136301210720313
Median:  1.0
Maximum:  10.0
Minimum:  0.0
Standard deviation:  2.1627014579408916


Feature name:  Active Mean
Mean:  0.0
Median:  0.0
Maximum:  0.0
Minimum:  0.0
Standard deviation:  0.0


Feature name:  Active Std
Mean:  0.0
Median:  0.0
Maximum:  0.0
Minimum:  0.0
Standard deviation:  0.0


Feature name:  Active Max
Mean:  0.0
Median:  0.0
Maximum:  0.0
Minimum:  0.0
Standard deviation:  0.0


Feature name:  Active Min
Mean:  0.0
Median:  0.0
Maximum:  0.0
Minimum:  0.0
Standard deviation:  0.0


Feature name:  Idle Mean
Mean:  0.0
Median:  0.0
Maximum:  0.0
Minimum:  0.0
Standard deviation:  0.0


Feature name:  Idle Std
Mean:  0.0
Median:  0.0
Maximum:  0.0
Minimum:  0.0
Standard deviation:  0.0


Feature name:  Idle Max
Mean:  0.0
Median:  0.0
Maximum:  0.0
Minimum:  0.0
Standard deviation:  0.0


Feature name:  Idle Min
Mean:  0.0
Median:  0.0
Maximum:  0.0
Minimum:  0.0
Standard deviation:  0.0


We shall perform handling of outliers on the main dataset after testing the approaches on different features of sample dataset.

  • For following 4 features, we shall perform Winsorization: -

    1. Init Fwd Win Bytes

    2. Init Bwd Win Bytes

    3. Fwd Seg Size Min

    4. Bwd IAT Mean

      • These 4 features have higher number of outliers, most of them being labelled as Benign.
      • Thus, by winsorization, we try to reduce the influence of outliers by handling extreme values.
      • Outliers also may indicate noisy data.
      • Since the above 4 features have large number of outliers, there is higher likelihood for occurrence of noisy data in those features.
      • Thus, winsorization will enable us to reduce the noise in the above features.
  • For remaining features, we shall perform imputation with median value.

    • Since the number of outliers in these features are very less, imputing them with median value will help to approximate the entries having outliers.
    • Most of our features are skewed, as the result, we perform imputation of outliers with respective Median values.

This process will help us ensure that all outliers are handled, and we do not delete any rows causing loss of data.

In [93]:
#Imputation of outliers with median
for col in features_to_impute:
    Q1 = cic_df[col].quantile(0.25)
    Q3 = cic_df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Replacing outliers with the median
    median_value = cic_df[col].median()
    cic_df[col] = np.where((cic_df[col] < lower_bound) | (cic_df[col] > upper_bound), median_value, cic_df[col])
In [94]:
#Winsorization of outliers
features_to_cap=['Init Fwd Win Bytes','Init Bwd Win Bytes','Fwd Seg Size Min','Bwd IAT Mean']
for col in features_to_cap:
    upper_limit=cic_df[col].quantile(0.95)
    lower_limit=cic_df[col].quantile(0.05)
    cic_df[col]=np.where(cic_df[col]>=upper_limit,
                upper_limit,
               np.where(cic_df[col]<=lower_limit,
               lower_limit,
               cic_df[col]))
In [95]:
del sampled_cic_df
In [96]:
sample_size=int(0.2*len(cic_df))
sampled_cic_df=cic_df.sample(n=sample_size, replace=False, random_state=42)
sampled_cic_df.shape
Out[96]:
(1833454, 59)
In [97]:
#Plotting on sampled dataset with log scale

# Getting the list of columns excluding 'isMalicious' and 'ClassLabel'
columns = [col for col in cic_df.columns if col not in ['isMalicious', 'ClassLabel']]

# Creating subplots for each column
fig, axes = plt.subplots(nrows=len(columns), ncols=1, figsize=(10, len(columns) * 5))

# Plot each column's histogram in a separate subplot
for i, column in enumerate(columns):
    sampled_cic_df[column].hist(bins=50, ax=axes[i], log=True)
    axes[i].set_xlabel("Values")
    axes[i].set_ylabel("Frequency w.r.t log scale")
    axes[i].set_title(column)

plt.tight_layout()
plt.show()
No description has been provided for this image
  • In the above histograms, we observed there are some features with single value.
  • Such features will not help us train the classifier because irrespective of type of event, those feature values will remain unchanged.
In [98]:
unique_columns=[col for col in sampled_cic_df.columns if sampled_cic_df[col].nunique() == 1]
print("Features with single value: ",unique_columns)
Features with single value:  ['Fwd PSH Flags', 'SYN Flag Count', 'URG Flag Count', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min']
In [99]:
#We will check if the above list remains same for main dataset.
unique_columns=[col for col in cic_df.columns if cic_df[col].nunique() == 1]
print("Features with single value: ",unique_columns)
Features with single value:  ['Fwd PSH Flags', 'SYN Flag Count', 'URG Flag Count', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min']
In [100]:
#We shall drop the above features from our dataset as they will not help in training our model.
sampled_cic_df=sampled_cic_df.drop(unique_columns,axis=1)
cic_df=cic_df.drop(unique_columns,axis=1)
In [101]:
print("Shape of main dataset: ",cic_df.shape)
print("Shape of sampled dataset: ",sampled_cic_df.shape)
Shape of main dataset:  (9167271, 48)
Shape of sampled dataset:  (1833454, 48)
In [102]:
sampled_cic_df.head()
Out[102]:
Flow Duration Total Fwd Packets Total Backward Packets Fwd Packets Length Total Bwd Packets Length Total Fwd Packet Length Max Fwd Packet Length Mean Fwd Packet Length Std Bwd Packet Length Max Bwd Packet Length Mean ... Subflow Fwd Packets Subflow Fwd Bytes Subflow Bwd Packets Subflow Bwd Bytes Init Fwd Win Bytes Init Bwd Win Bytes Fwd Act Data Packets Fwd Seg Size Min ClassLabel isMalicious
5968290 3813760.0 5.0 3.0 935.0 397.0 935.0 187.000 418.144714 397.0 132.333328 ... 5.0 935.0 3.0 397.0 219.0 211.0 1.0 32.0 Benign 0
8285216 396839.0 2.0 0.0 0.0 0.0 0.0 0.000 0.000000 0.0 0.000000 ... 2.0 0.0 0.0 0.0 63326.0 235.0 0.0 20.0 Benign 0
8349977 1914354.0 8.0 7.0 1144.0 1581.0 677.0 143.000 227.969925 1173.0 225.857147 ... 8.0 1144.0 7.0 1581.0 8192.0 62856.0 5.0 20.0 Benign 0
7180832 4002.0 6.0 0.0 2064.0 0.0 440.0 44.000 148.722565 0.0 0.000000 ... 6.0 2064.0 0.0 0.0 8192.0 235.0 5.0 8.0 DDoS 1
2324438 5368715.0 8.0 6.0 355.0 232.0 198.0 44.375 75.864426 1460.0 108.000000 ... 8.0 355.0 6.0 232.0 8192.0 123.0 3.0 20.0 Benign 0

5 rows × 48 columns

In [103]:
sampled_cic_df.tail()
Out[103]:
Flow Duration Total Fwd Packets Total Backward Packets Fwd Packets Length Total Bwd Packets Length Total Fwd Packet Length Max Fwd Packet Length Mean Fwd Packet Length Std Bwd Packet Length Max Bwd Packet Length Mean ... Subflow Fwd Packets Subflow Fwd Bytes Subflow Bwd Packets Subflow Bwd Bytes Init Fwd Win Bytes Init Bwd Win Bytes Fwd Act Data Packets Fwd Seg Size Min ClassLabel isMalicious
1606912 189583.0 10.0 8.0 496.0 232.0 192.0 49.599998 77.654793 1460.0 108.0 ... 10.0 496.0 8.0 232.0 8192.0 31.0 4.0 20.0 Benign 0
7433839 3000787.0 4.0 0.0 2064.0 0.0 516.0 44.000000 0.000000 0.0 0.0 ... 4.0 2064.0 0.0 0.0 8192.0 235.0 3.0 20.0 DDoS 1
2510144 40.0 1.0 1.0 0.0 0.0 0.0 0.000000 0.000000 0.0 0.0 ... 1.0 0.0 1.0 0.0 8192.0 16625.0 0.0 8.0 Benign 0
760618 396839.0 2.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.0 0.0 ... 2.0 0.0 0.0 0.0 279.0 235.0 0.0 20.0 Benign 0
7134908 4097.0 3.0 0.0 97.0 0.0 440.0 44.000000 27.430199 0.0 0.0 ... 3.0 97.0 0.0 0.0 8192.0 235.0 1.0 20.0 DDoS 1

5 rows × 48 columns

In [104]:
import math
In [105]:
#Creating a copy of sampled dataset to plot pyramid chart
pyramid_sampled_df=sampled_cic_df.copy()
In [106]:
pyramid_sampled_df.shape
Out[106]:
(1833454, 48)
In [107]:
pyramid_sampled_df=pyramid_sampled_df.drop('ClassLabel',axis=1)
In [108]:
pyramid_sampled_df.shape
Out[108]:
(1833454, 47)
In [109]:
n=pyramid_sampled_df.shape[0]
print("n: ",n)
n:  1833454
In [110]:
column_list=pyramid_sampled_df.columns.tolist()
In [111]:
for col in column_list:
    try:
        p25, p75 = np.percentile(pyramid_sampled_df[col], [25, 75])
        print("Feature name: ",col)
        print("p25: ",p25)
        print("p75: ",p75)
        width=2.*(p75-p25)/n**(1./3)
        # Creating bins using the calculated width
        pyramid_sampled_df[col] = pd.cut(pyramid_sampled_df[col], bins=np.arange(pyramid_sampled_df[col].min(), pyramid_sampled_df[col].max() + width, width))
        # Grouping data by bins and isMalicious (target festure)
        grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
        # Calculate normalized frequencies for each bin and class
        normalized_data=grouped_data.div(grouped_data.sum(axis=1), axis=0)

        # Create the pyramid chart using matplotlib
        fig, ax = plt.subplots()

        # Plot the bars for each class in opposite directions
        ax.bar(normalized_data.index.astype(str), normalized_data[0], width=0.8, align='center', color='lightblue', label="Benign")
        ax.bar(normalized_data.index.astype(str), -normalized_data[1], width=0.8, align='center', color='lightcoral', label="Malicious")

        # Customize the plot
        ax.set_xlabel(col)
        ax.set_ylabel('Normalized Frequency')
        ax.set_title(col)
        ax.legend()

        # Show the plot
        plt.show()
    except ValueError:
        pass
Feature name:  Flow Duration
p25:  11621.0
p75:  1273497.25
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Total Fwd Packets
p25:  2.0
p75:  5.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Total Backward Packets
p25:  1.0
p75:  4.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Fwd Packets Length Total
p25:  30.0
p75:  858.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Bwd Packets Length Total
p25:  0.0
p75:  338.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Fwd Packet Length Max
p25:  20.0
p75:  440.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Fwd Packet Length Mean
p25:  7.0
p75:  91.2727279663086
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Fwd Packet Length Std
p25:  0.0
p75:  168.00892639160156
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Bwd Packet Length Max
p25:  0.0
p75:  859.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Bwd Packet Length Mean
p25:  0.0
p75:  161.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Bwd Packet Length Std
p25:  0.0
p75:  284.55936431884766
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Flow Bytes/s
p25:  55.825086049175
p75:  1456.5540008269675
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Flow Packets/s
p25:  1.46565167235
p75:  32.86284691
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Flow IAT Mean
p25:  2578.0
p75:  148040.30859375
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Flow IAT Std
p25:  0.0
p75:  112470.814453125
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Flow IAT Max
p25:  10618.0
p75:  984370.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Flow IAT Min
p25:  3.0
p75:  21.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Fwd IAT Total
p25:  285.0
p75:  323869.75
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Fwd IAT Mean
p25:  136.0
p75:  81006.75
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Fwd IAT Std
p25:  0.0
p75:  2294.5615234375
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
C:\Users\pc\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\pylabtools.py:152: UserWarning: Creating legend with loc="best" can be slow with large amounts of data.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
Feature name:  Fwd IAT Max
p25:  206.0
p75:  997940.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Fwd IAT Min
p25:  2.0
p75:  46.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Bwd IAT Total
p25:  0.0
p75:  25579.5
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Bwd IAT Mean
p25:  0.0
p75:  263766.21875
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Bwd IAT Std
p25:  0.0
p75:  4926.3211669921875
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Bwd IAT Max
p25:  0.0
p75:  36136.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Bwd IAT Min
p25:  0.0
p75:  4.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Fwd Header Length
p25:  40.0
p75:  136.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Bwd Header Length
p25:  8.0
p75:  104.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Fwd Packets/s
p25:  0.878700390458107
p75:  17.483688831329346
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Bwd Packets/s
p25:  0.14235177636146545
p75:  4.386377453804016
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Packet Length Max
p25:  46.0
p75:  935.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Packet Length Mean
p25:  30.75
p75:  137.82608032226562
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Packet Length Std
p25:  8.763561248779297
p75:  317.73126220703125
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Packet Length Variance
p25:  76.80000305175781
p75:  89114.359375
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Avg Packet Size
p25:  41.0
p75:  151.09524536132812
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Avg Fwd Segment Size
p25:  7.0
p75:  91.2727279663086
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Avg Bwd Segment Size
p25:  0.0
p75:  161.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Subflow Fwd Packets
p25:  2.0
p75:  5.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Subflow Fwd Bytes
p25:  30.0
p75:  858.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Subflow Bwd Packets
p25:  1.0
p75:  4.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Subflow Bwd Bytes
p25:  0.0
p75:  338.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Init Fwd Win Bytes
p25:  8192.0
p75:  8192.0
Feature name:  Init Bwd Win Bytes
p25:  219.0
p75:  259.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:9: RuntimeWarning: divide by zero encountered in double_scalars
  pyramid_sampled_df[col] = pd.cut(pyramid_sampled_df[col], bins=np.arange(pyramid_sampled_df[col].min(), pyramid_sampled_df[col].max() + width, width))
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
C:\Users\pc\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\pylabtools.py:152: UserWarning: Creating legend with loc="best" can be slow with large amounts of data.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
Feature name:  Fwd Act Data Packets
p25:  0.0
p75:  2.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
No description has been provided for this image
Feature name:  Fwd Seg Size Min
p25:  20.0
p75:  20.0
Feature name:  isMalicious
p25:  0.0
p75:  0.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:9: RuntimeWarning: divide by zero encountered in double_scalars
  pyramid_sampled_df[col] = pd.cut(pyramid_sampled_df[col], bins=np.arange(pyramid_sampled_df[col].min(), pyramid_sampled_df[col].max() + width, width))
  • We selected Freedman-Diaconis Rule for computing the number of bins for each feature to plot the charts.

  • The above rule helps to compute bin width based on each feature's IQR. Thus, it helps to reduce the impact of skewness in data, does not assume the feature to be normally distributed. Since it uses IQR, it is effecitively handles potential outliers in the data.

  • Following features have almost equal number of Malicious and Benign records in most of the bins: -

    1. Flow Duration
    2. Flow IAT Max
    3. Fwd Header Length
  • Following features have some bins where number of Malicious records are relatively more then number of Benign records, and we observed change in pattern: -

    1. Flow Bytes/s
    2. Flow Packets/s
    3. Flow IAT Std
    4. Fwd IAT Max
    5. Bwd IAT Std
    6. Bwd IAT Max
    7. Fwd Packets/s
    8. Bwd Packets/s
  • "Init Bwd Win Bytes" was a rare feature which had only 1 bin with Malicious records, rest all bins had Benign records.

  • Remaining all features have relatively very high number of Benign records compared to Malicious records in most of the bins.

  • While carrying out the interpretation, small variations and changes were not recorded, as decisions based on minor interpretations may result in incorrect analysis. Only the patterns which are thick and broadly visible were recorded from the above Pyramid charts plotted with respect to the binary target feature: isMalicious.

In [112]:
#Encoding the values in field: ClassLabel and thus, we will get a unique numerical identifier for each type of attack
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
sampled_cic_df["attack_id"]=le.fit_transform(sampled_cic_df["ClassLabel"])
print("Attack id: ",sampled_cic_df["attack_id"].unique())
                                            
Attack id:  [0 3 4 5 1 2 6 7]
In [113]:
sampled_cic_df.head()
Out[113]:
Flow Duration Total Fwd Packets Total Backward Packets Fwd Packets Length Total Bwd Packets Length Total Fwd Packet Length Max Fwd Packet Length Mean Fwd Packet Length Std Bwd Packet Length Max Bwd Packet Length Mean ... Subflow Fwd Bytes Subflow Bwd Packets Subflow Bwd Bytes Init Fwd Win Bytes Init Bwd Win Bytes Fwd Act Data Packets Fwd Seg Size Min ClassLabel isMalicious attack_id
5968290 3813760.0 5.0 3.0 935.0 397.0 935.0 187.000 418.144714 397.0 132.333328 ... 935.0 3.0 397.0 219.0 211.0 1.0 32.0 Benign 0 0
8285216 396839.0 2.0 0.0 0.0 0.0 0.0 0.000 0.000000 0.0 0.000000 ... 0.0 0.0 0.0 63326.0 235.0 0.0 20.0 Benign 0 0
8349977 1914354.0 8.0 7.0 1144.0 1581.0 677.0 143.000 227.969925 1173.0 225.857147 ... 1144.0 7.0 1581.0 8192.0 62856.0 5.0 20.0 Benign 0 0
7180832 4002.0 6.0 0.0 2064.0 0.0 440.0 44.000 148.722565 0.0 0.000000 ... 2064.0 0.0 0.0 8192.0 235.0 5.0 8.0 DDoS 1 3
2324438 5368715.0 8.0 6.0 355.0 232.0 198.0 44.375 75.864426 1460.0 108.000000 ... 355.0 6.0 232.0 8192.0 123.0 3.0 20.0 Benign 0 0

5 rows × 49 columns

In [114]:
label_mapping = dict(zip(le.classes_, le.transform(le.classes_))) 
print("Attack id of each distinct value in field ClassLabel:", label_mapping)
Attack id of each distinct value in field ClassLabel: {'Benign': 0, 'Botnet': 1, 'Bruteforce': 2, 'DDoS': 3, 'DoS': 4, 'Infiltration': 5, 'Portscan': 6, 'Webattack': 7}
In [115]:
corr_sample_size=int(0.2*len(sampled_cic_df))
corr_df=sampled_cic_df.sample(n=corr_sample_size, replace=False, random_state=42)
In [116]:
type(corr_df)
Out[116]:
pandas.core.frame.DataFrame
In [117]:
#Dropping the columns: ClassLabel, isMalicious from the dataframe
corr_df=corr_df.drop(['isMalicious','ClassLabel'],axis=1)
In [118]:
corr_df.dtypes
Out[118]:
Flow Duration               float64
Total Fwd Packets           float64
Total Backward Packets      float64
Fwd Packets Length Total    float64
Bwd Packets Length Total    float64
Fwd Packet Length Max       float64
Fwd Packet Length Mean      float32
Fwd Packet Length Std       float32
Bwd Packet Length Max       float64
Bwd Packet Length Mean      float32
Bwd Packet Length Std       float32
Flow Bytes/s                float64
Flow Packets/s              float64
Flow IAT Mean               float64
Flow IAT Std                float32
Flow IAT Max                float64
Flow IAT Min                float64
Fwd IAT Total               float64
Fwd IAT Mean                float64
Fwd IAT Std                 float32
Fwd IAT Max                 float64
Fwd IAT Min                 float64
Bwd IAT Total               float64
Bwd IAT Mean                float32
Bwd IAT Std                 float32
Bwd IAT Max                 float64
Bwd IAT Min                 float64
Fwd Header Length           float64
Bwd Header Length           float64
Fwd Packets/s               float32
Bwd Packets/s               float32
Packet Length Max           float64
Packet Length Mean          float32
Packet Length Std           float32
Packet Length Variance      float32
Avg Packet Size             float32
Avg Fwd Segment Size        float32
Avg Bwd Segment Size        float32
Subflow Fwd Packets         float64
Subflow Fwd Bytes           float64
Subflow Bwd Packets         float64
Subflow Bwd Bytes           float64
Init Fwd Win Bytes          float64
Init Bwd Win Bytes          float64
Fwd Act Data Packets        float64
Fwd Seg Size Min            float64
attack_id                     int32
dtype: object
In [119]:
corr_df.shape
Out[119]:
(366690, 47)
In [120]:
corr_df.head()
Out[120]:
Flow Duration Total Fwd Packets Total Backward Packets Fwd Packets Length Total Bwd Packets Length Total Fwd Packet Length Max Fwd Packet Length Mean Fwd Packet Length Std Bwd Packet Length Max Bwd Packet Length Mean ... Avg Bwd Segment Size Subflow Fwd Packets Subflow Fwd Bytes Subflow Bwd Packets Subflow Bwd Bytes Init Fwd Win Bytes Init Bwd Win Bytes Fwd Act Data Packets Fwd Seg Size Min attack_id
3357734 13523961.0 2.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.0 0.0 ... 0.0 2.0 0.0 0.0 0.0 2049.0 235.0 0.0 20.0 3
7460867 20572.0 2.0 2.0 90.0 172.0 45.0 45.000000 0.000000 86.0 86.0 ... 86.0 2.0 90.0 2.0 172.0 8192.0 235.0 1.0 20.0 0
2952728 1374475.0 3.0 4.0 20.0 964.0 20.0 6.666667 11.547006 964.0 241.0 ... 241.0 3.0 20.0 4.0 964.0 8192.0 211.0 1.0 20.0 3
7243433 84231.0 4.0 2.0 168.0 414.0 42.0 42.000000 0.000000 207.0 207.0 ... 207.0 4.0 168.0 2.0 414.0 8192.0 235.0 3.0 20.0 0
4133394 1378.0 5.0 2.0 935.0 309.0 935.0 187.000000 418.144714 309.0 154.5 ... 154.5 5.0 935.0 2.0 309.0 65535.0 32768.0 1.0 20.0 0

5 rows × 47 columns

In [121]:
# Computing the correlation matrix
corr_matrix = corr_df.corr()

fig, ax = plt.subplots(figsize=(40, 40))

# Creating the heatmap
cax = ax.matshow(corr_matrix, cmap="coolwarm")

# Adding color bar
fig.colorbar(cax)

# Setting ticks and labels
ax.set_xticks(np.arange(len(corr_df.columns)))
ax.set_yticks(np.arange(len(corr_df.columns)))
ax.set_xticklabels(corr_df.columns, rotation=90)
ax.set_yticklabels(corr_df.columns)

# Adding the correlation coefficients as text on the heatmap
for i in range(len(corr_df.columns)):
    for j in range(len(corr_df.columns)):
        text = ax.text(j, i, round(corr_matrix.iloc[i, j], 2),
                       ha="center", va="center", color="black", fontsize=10)

plt.title("Correlation Matrix Heatmap w.r.t attack_id", fontsize=50)
plt.show()
No description has been provided for this image
  • If we take the original sampled dataset: sampled_cic_df which was compiled by taking 20% of records from original dataset, and try to plot the correlation matrix as above, we get error for insufficient memory.

  • As the result, from sampled dataset, we took another sample: corr_df by taking 20% of records, and fetched the results for correlation among features and with the target feature: attack_id.

  • From the above results, we observed as all features against the target have blue color squares, all features have weak relation with the target feature.

  • However, there are many independent features having red and dark red squares which indicates there is strong relation among some of the independent features of the sub-sampled dataset.

  • Some of the examples are: -

    • Fwd Packet Length Max - Fwd Packet Length Mean = 0.88
    • Fwd Packet Length Max - Fwd Packet Length Std = 0.91
    • Bwd Packet Length Std - Packet Length Max = 0.91
    • Bwd Packet Length Std = Packet Length Mean = 0.71
In [122]:
stdev=sampled_cic_df.std(numeric_only=True)
zero_std_cols= stdev[stdev == 0].index.tolist()
print("Features with zero standard deviation in sample dataset: ",zero_std_cols)
Features with zero standard deviation in sample dataset:  []
In [123]:
print(corr_matrix)
                          Flow Duration  Total Fwd Packets  \
Flow Duration                  1.000000           0.344572   
Total Fwd Packets              0.344572           1.000000   
Total Backward Packets         0.284508           0.693930   
Fwd Packets Length Total       0.266345           0.580110   
Bwd Packets Length Total       0.227557           0.536628   
Fwd Packet Length Max          0.291309           0.535619   
Fwd Packet Length Mean         0.223883           0.443021   
Fwd Packet Length Std          0.258275           0.460252   
Bwd Packet Length Max          0.193896           0.559189   
Bwd Packet Length Mean         0.069250           0.389019   
Bwd Packet Length Std          0.208548           0.526333   
Flow Bytes/s                  -0.168463          -0.056336   
Flow Packets/s                -0.201210          -0.142290   
Flow IAT Mean                  0.527404           0.155158   
Flow IAT Std                   0.565126           0.229385   
Flow IAT Max                   0.530732           0.198171   
Flow IAT Min                  -0.087898          -0.165506   
Fwd IAT Total                  0.780641           0.396678   
Fwd IAT Mean                   0.518293           0.244505   
Fwd IAT Std                    0.262062           0.526483   
Fwd IAT Max                    0.359788           0.225066   
Fwd IAT Min                    0.042262           0.121484   
Bwd IAT Total                  0.124080           0.324204   
Bwd IAT Mean                   0.020241           0.192164   
Bwd IAT Std                    0.206601           0.448569   
Bwd IAT Max                    0.209195           0.410429   
Bwd IAT Min                    0.110107           0.098966   
Fwd Header Length              0.330307           0.806061   
Bwd Header Length              0.294374           0.711120   
Fwd Packets/s                 -0.201649          -0.141910   
Bwd Packets/s                 -0.215691          -0.126428   
Packet Length Max              0.244895           0.591023   
Packet Length Mean             0.160033           0.499634   
Packet Length Std              0.211278           0.500338   
Packet Length Variance         0.183152           0.456418   
Avg Packet Size                0.131517           0.444920   
Avg Fwd Segment Size           0.223883           0.443021   
Avg Bwd Segment Size           0.069250           0.389019   
Subflow Fwd Packets            0.344572           1.000000   
Subflow Fwd Bytes              0.266345           0.580110   
Subflow Bwd Packets            0.284508           0.693930   
Subflow Bwd Bytes              0.227557           0.536628   
Init Fwd Win Bytes            -0.136505           0.028631   
Init Bwd Win Bytes             0.220855           0.541543   
Fwd Act Data Packets           0.245885           0.755914   
Fwd Seg Size Min               0.172877           0.151001   
attack_id                      0.026159          -0.039465   

                          Total Backward Packets  Fwd Packets Length Total  \
Flow Duration                           0.284508                  0.266345   
Total Fwd Packets                       0.693930                  0.580110   
Total Backward Packets                  1.000000                  0.457674   
Fwd Packets Length Total                0.457674                  1.000000   
Bwd Packets Length Total                0.667482                  0.470164   
Fwd Packet Length Max                   0.431439                  0.796363   
Fwd Packet Length Mean                  0.454429                  0.673515   
Fwd Packet Length Std                   0.418142                  0.656346   
Bwd Packet Length Max                   0.626985                  0.547430   
Bwd Packet Length Mean                  0.530979                  0.397051   
Bwd Packet Length Std                   0.589303                  0.512406   
Flow Bytes/s                            0.053830                 -0.001175   
Flow Packets/s                         -0.046278                 -0.068197   
Flow IAT Mean                           0.098510                  0.200397   
Flow IAT Std                            0.192020                  0.312305   
Flow IAT Max                            0.129965                  0.232288   
Flow IAT Min                           -0.113239                 -0.146314   
Fwd IAT Total                           0.272520                  0.350136   
Fwd IAT Mean                            0.145574                  0.293066   
Fwd IAT Std                             0.513079                  0.413410   
Fwd IAT Max                             0.106457                  0.292654   
Fwd IAT Min                             0.181549                  0.101196   
Bwd IAT Total                           0.476214                  0.259034   
Bwd IAT Mean                            0.163481                  0.183088   
Bwd IAT Std                             0.512708                  0.339885   
Bwd IAT Max                             0.521269                  0.281791   
Bwd IAT Min                             0.075412                  0.159474   
Fwd Header Length                       0.615261                  0.570863   
Bwd Header Length                       0.846017                  0.489739   
Fwd Packets/s                          -0.068510                 -0.072082   
Bwd Packets/s                          -0.013203                 -0.022332   
Packet Length Max                       0.596776                  0.680678   
Packet Length Mean                      0.570523                  0.577650   
Packet Length Std                       0.552656                  0.554814   
Packet Length Variance                  0.457982                  0.514811   
Avg Packet Size                         0.537342                  0.535458   
Avg Fwd Segment Size                    0.454429                  0.673515   
Avg Bwd Segment Size                    0.530979                  0.397051   
Subflow Fwd Packets                     0.693930                  0.580110   
Subflow Fwd Bytes                       0.457674                  1.000000   
Subflow Bwd Packets                     1.000000                  0.457674   
Subflow Bwd Bytes                       0.667482                  0.470164   
Init Fwd Win Bytes                     -0.018963                  0.148935   
Init Bwd Win Bytes                      0.501559                  0.477821   
Fwd Act Data Packets                    0.602988                  0.559723   
Fwd Seg Size Min                        0.124126                  0.200562   
attack_id                              -0.064490                 -0.085335   

                          Bwd Packets Length Total  Fwd Packet Length Max  \
Flow Duration                             0.227557               0.291309   
Total Fwd Packets                         0.536628               0.535619   
Total Backward Packets                    0.667482               0.431439   
Fwd Packets Length Total                  0.470164               0.796363   
Bwd Packets Length Total                  1.000000               0.451904   
Fwd Packet Length Max                     0.451904               1.000000   
Fwd Packet Length Mean                    0.433777               0.876929   
Fwd Packet Length Std                     0.401362               0.910410   
Bwd Packet Length Max                     0.642817               0.459193   
Bwd Packet Length Mean                    0.556433               0.387529   
Bwd Packet Length Std                     0.560662               0.475939   
Flow Bytes/s                             -0.043166              -0.002276   
Flow Packets/s                           -0.054143              -0.017606   
Flow IAT Mean                             0.038200               0.206706   
Flow IAT Std                              0.142081               0.403239   
Flow IAT Max                              0.117542               0.256882   
Flow IAT Min                             -0.075916              -0.159033   
Fwd IAT Total                             0.261268               0.383010   
Fwd IAT Mean                              0.119108               0.364603   
Fwd IAT Std                               0.606764               0.324852   
Fwd IAT Max                               0.145319               0.323373   
Fwd IAT Min                               0.152866               0.067656   
Bwd IAT Total                             0.539414               0.198299   
Bwd IAT Mean                              0.119134               0.113974   
Bwd IAT Std                               0.601234               0.289248   
Bwd IAT Max                               0.637725               0.215992   
Bwd IAT Min                               0.043402               0.317312   
Fwd Header Length                         0.488606               0.580049   
Bwd Header Length                         0.559730               0.468355   
Fwd Packets/s                            -0.064800              -0.024095   
Bwd Packets/s                            -0.046593              -0.034213   
Packet Length Max                         0.601459               0.693306   
Packet Length Mean                        0.530830               0.624118   
Packet Length Std                         0.485171               0.605821   
Packet Length Variance                    0.521250               0.610889   
Avg Packet Size                           0.484636               0.592635   
Avg Fwd Segment Size                      0.433777               0.876929   
Avg Bwd Segment Size                      0.556433               0.387529   
Subflow Fwd Packets                       0.536628               0.535619   
Subflow Fwd Bytes                         0.470164               0.796363   
Subflow Bwd Packets                       0.667482               0.431439   
Subflow Bwd Bytes                         1.000000               0.451904   
Init Fwd Win Bytes                        0.058459               0.250035   
Init Bwd Win Bytes                        0.626598               0.463152   
Fwd Act Data Packets                      0.495346               0.397308   
Fwd Seg Size Min                          0.053893               0.267014   
attack_id                                -0.035668              -0.114476   

                          Fwd Packet Length Mean  Fwd Packet Length Std  \
Flow Duration                           0.223883               0.258275   
Total Fwd Packets                       0.443021               0.460252   
Total Backward Packets                  0.454429               0.418142   
Fwd Packets Length Total                0.673515               0.656346   
Bwd Packets Length Total                0.433777               0.401362   
Fwd Packet Length Max                   0.876929               0.910410   
Fwd Packet Length Mean                  1.000000               0.906266   
Fwd Packet Length Std                   0.906266               1.000000   
Bwd Packet Length Max                   0.389350               0.396018   
Bwd Packet Length Mean                  0.430169               0.376194   
Bwd Packet Length Std                   0.406844               0.451969   
Flow Bytes/s                            0.088291               0.039993   
Flow Packets/s                          0.079698               0.047994   
Flow IAT Mean                           0.132827               0.174096   
Flow IAT Std                            0.350501               0.379778   
Flow IAT Max                            0.116476               0.219872   
Flow IAT Min                           -0.096717              -0.162102   
Fwd IAT Total                           0.321079               0.333408   
Fwd IAT Mean                            0.314509               0.330904   
Fwd IAT Std                             0.285779               0.229975   
Fwd IAT Max                             0.178340               0.270654   
Fwd IAT Min                             0.065737               0.093723   
Bwd IAT Total                           0.203509               0.160607   
Bwd IAT Mean                            0.018704               0.077178   
Bwd IAT Std                             0.242220               0.221190   
Bwd IAT Max                             0.166386               0.154433   
Bwd IAT Min                             0.315777               0.378219   
Fwd Header Length                       0.467705               0.530178   
Bwd Header Length                       0.462312               0.456745   
Fwd Packets/s                           0.065484               0.035362   
Bwd Packets/s                           0.053065              -0.014289   
Packet Length Max                       0.611326               0.634520   
Packet Length Mean                      0.666252               0.595388   
Packet Length Std                       0.567894               0.610741   
Packet Length Variance                  0.576243               0.627071   
Avg Packet Size                         0.656824               0.563593   
Avg Fwd Segment Size                    1.000000               0.906266   
Avg Bwd Segment Size                    0.430169               0.376194   
Subflow Fwd Packets                     0.443021               0.460252   
Subflow Fwd Bytes                       0.673515               0.656346   
Subflow Bwd Packets                     0.454429               0.418142   
Subflow Bwd Bytes                       0.433777               0.401362   
Init Fwd Win Bytes                      0.261404               0.313570   
Init Bwd Win Bytes                      0.415983               0.390927   
Fwd Act Data Packets                    0.323995               0.282448   
Fwd Seg Size Min                        0.226251               0.302648   
attack_id                              -0.196100              -0.141036   

                          Bwd Packet Length Max  Bwd Packet Length Mean  ...  \
Flow Duration                          0.193896                0.069250  ...   
Total Fwd Packets                      0.559189                0.389019  ...   
Total Backward Packets                 0.626985                0.530979  ...   
Fwd Packets Length Total               0.547430                0.397051  ...   
Bwd Packets Length Total               0.642817                0.556433  ...   
Fwd Packet Length Max                  0.459193                0.387529  ...   
Fwd Packet Length Mean                 0.389350                0.430169  ...   
Fwd Packet Length Std                  0.396018                0.376194  ...   
Bwd Packet Length Max                  1.000000                0.745937  ...   
Bwd Packet Length Mean                 0.745937                1.000000  ...   
Bwd Packet Length Std                  0.905455                0.713763  ...   
Flow Bytes/s                           0.027773                0.059257  ...   
Flow Packets/s                        -0.067909                0.017235  ...   
Flow IAT Mean                          0.122742                0.037822  ...   
Flow IAT Std                           0.130569                0.079935  ...   
Flow IAT Max                           0.291129                0.132577  ...   
Flow IAT Min                          -0.128903               -0.045374  ...   
Fwd IAT Total                          0.192159                0.070620  ...   
Fwd IAT Mean                           0.110592                0.029596  ...   
Fwd IAT Std                            0.400185                0.257838  ...   
Fwd IAT Max                            0.303202                0.145865  ...   
Fwd IAT Min                            0.239714                0.143203  ...   
Bwd IAT Total                          0.388287                0.314509  ...   
Bwd IAT Mean                           0.308447                0.216200  ...   
Bwd IAT Std                            0.415092                0.319511  ...   
Bwd IAT Max                            0.441702                0.364072  ...   
Bwd IAT Min                            0.021272                0.069716  ...   
Fwd Header Length                      0.566809                0.398443  ...   
Bwd Header Length                      0.638425                0.530512  ...   
Fwd Packets/s                         -0.079164                0.011930  ...   
Bwd Packets/s                         -0.015688                0.105039  ...   
Packet Length Max                      0.916820                0.702215  ...   
Packet Length Mean                     0.736857                0.808163  ...   
Packet Length Std                      0.850145                0.682757  ...   
Packet Length Variance                 0.678489                0.724085  ...   
Avg Packet Size                        0.695708                0.801836  ...   
Avg Fwd Segment Size                   0.389350                0.430169  ...   
Avg Bwd Segment Size                   0.745937                1.000000  ...   
Subflow Fwd Packets                    0.559189                0.389019  ...   
Subflow Fwd Bytes                      0.547430                0.397051  ...   
Subflow Bwd Packets                    0.626985                0.530979  ...   
Subflow Bwd Bytes                      0.642817                0.556433  ...   
Init Fwd Win Bytes                     0.082989                0.125144  ...   
Init Bwd Win Bytes                     0.383273                0.277818  ...   
Fwd Act Data Packets                   0.585226                0.415084  ...   
Fwd Seg Size Min                       0.082755                0.017201  ...   
attack_id                             -0.064293               -0.085765  ...   

                          Avg Bwd Segment Size  Subflow Fwd Packets  \
Flow Duration                         0.069250             0.344572   
Total Fwd Packets                     0.389019             1.000000   
Total Backward Packets                0.530979             0.693930   
Fwd Packets Length Total              0.397051             0.580110   
Bwd Packets Length Total              0.556433             0.536628   
Fwd Packet Length Max                 0.387529             0.535619   
Fwd Packet Length Mean                0.430169             0.443021   
Fwd Packet Length Std                 0.376194             0.460252   
Bwd Packet Length Max                 0.745937             0.559189   
Bwd Packet Length Mean                1.000000             0.389019   
Bwd Packet Length Std                 0.713763             0.526333   
Flow Bytes/s                          0.059257            -0.056336   
Flow Packets/s                        0.017235            -0.142290   
Flow IAT Mean                         0.037822             0.155158   
Flow IAT Std                          0.079935             0.229385   
Flow IAT Max                          0.132577             0.198171   
Flow IAT Min                         -0.045374            -0.165506   
Fwd IAT Total                         0.070620             0.396678   
Fwd IAT Mean                          0.029596             0.244505   
Fwd IAT Std                           0.257838             0.526483   
Fwd IAT Max                           0.145865             0.225066   
Fwd IAT Min                           0.143203             0.121484   
Bwd IAT Total                         0.314509             0.324204   
Bwd IAT Mean                          0.216200             0.192164   
Bwd IAT Std                           0.319511             0.448569   
Bwd IAT Max                           0.364072             0.410429   
Bwd IAT Min                           0.069716             0.098966   
Fwd Header Length                     0.398443             0.806061   
Bwd Header Length                     0.530512             0.711120   
Fwd Packets/s                         0.011930            -0.141910   
Bwd Packets/s                         0.105039            -0.126428   
Packet Length Max                     0.702215             0.591023   
Packet Length Mean                    0.808163             0.499634   
Packet Length Std                     0.682757             0.500338   
Packet Length Variance                0.724085             0.456418   
Avg Packet Size                       0.801836             0.444920   
Avg Fwd Segment Size                  0.430169             0.443021   
Avg Bwd Segment Size                  1.000000             0.389019   
Subflow Fwd Packets                   0.389019             1.000000   
Subflow Fwd Bytes                     0.397051             0.580110   
Subflow Bwd Packets                   0.530979             0.693930   
Subflow Bwd Bytes                     0.556433             0.536628   
Init Fwd Win Bytes                    0.125144             0.028631   
Init Bwd Win Bytes                    0.277818             0.541543   
Fwd Act Data Packets                  0.415084             0.755914   
Fwd Seg Size Min                      0.017201             0.151001   
attack_id                            -0.085765            -0.039465   

                          Subflow Fwd Bytes  Subflow Bwd Packets  \
Flow Duration                      0.266345             0.284508   
Total Fwd Packets                  0.580110             0.693930   
Total Backward Packets             0.457674             1.000000   
Fwd Packets Length Total           1.000000             0.457674   
Bwd Packets Length Total           0.470164             0.667482   
Fwd Packet Length Max              0.796363             0.431439   
Fwd Packet Length Mean             0.673515             0.454429   
Fwd Packet Length Std              0.656346             0.418142   
Bwd Packet Length Max              0.547430             0.626985   
Bwd Packet Length Mean             0.397051             0.530979   
Bwd Packet Length Std              0.512406             0.589303   
Flow Bytes/s                      -0.001175             0.053830   
Flow Packets/s                    -0.068197            -0.046278   
Flow IAT Mean                      0.200397             0.098510   
Flow IAT Std                       0.312305             0.192020   
Flow IAT Max                       0.232288             0.129965   
Flow IAT Min                      -0.146314            -0.113239   
Fwd IAT Total                      0.350136             0.272520   
Fwd IAT Mean                       0.293066             0.145574   
Fwd IAT Std                        0.413410             0.513079   
Fwd IAT Max                        0.292654             0.106457   
Fwd IAT Min                        0.101196             0.181549   
Bwd IAT Total                      0.259034             0.476214   
Bwd IAT Mean                       0.183088             0.163481   
Bwd IAT Std                        0.339885             0.512708   
Bwd IAT Max                        0.281791             0.521269   
Bwd IAT Min                        0.159474             0.075412   
Fwd Header Length                  0.570863             0.615261   
Bwd Header Length                  0.489739             0.846017   
Fwd Packets/s                     -0.072082            -0.068510   
Bwd Packets/s                     -0.022332            -0.013203   
Packet Length Max                  0.680678             0.596776   
Packet Length Mean                 0.577650             0.570523   
Packet Length Std                  0.554814             0.552656   
Packet Length Variance             0.514811             0.457982   
Avg Packet Size                    0.535458             0.537342   
Avg Fwd Segment Size               0.673515             0.454429   
Avg Bwd Segment Size               0.397051             0.530979   
Subflow Fwd Packets                0.580110             0.693930   
Subflow Fwd Bytes                  1.000000             0.457674   
Subflow Bwd Packets                0.457674             1.000000   
Subflow Bwd Bytes                  0.470164             0.667482   
Init Fwd Win Bytes                 0.148935            -0.018963   
Init Bwd Win Bytes                 0.477821             0.501559   
Fwd Act Data Packets               0.559723             0.602988   
Fwd Seg Size Min                   0.200562             0.124126   
attack_id                         -0.085335            -0.064490   

                          Subflow Bwd Bytes  Init Fwd Win Bytes  \
Flow Duration                      0.227557           -0.136505   
Total Fwd Packets                  0.536628            0.028631   
Total Backward Packets             0.667482           -0.018963   
Fwd Packets Length Total           0.470164            0.148935   
Bwd Packets Length Total           1.000000            0.058459   
Fwd Packet Length Max              0.451904            0.250035   
Fwd Packet Length Mean             0.433777            0.261404   
Fwd Packet Length Std              0.401362            0.313570   
Bwd Packet Length Max              0.642817            0.082989   
Bwd Packet Length Mean             0.556433            0.125144   
Bwd Packet Length Std              0.560662            0.169575   
Flow Bytes/s                      -0.043166            0.062303   
Flow Packets/s                    -0.054143            0.192189   
Flow IAT Mean                      0.038200           -0.113184   
Flow IAT Std                       0.142081           -0.153843   
Flow IAT Max                       0.117542           -0.107938   
Flow IAT Min                      -0.075916           -0.073047   
Fwd IAT Total                      0.261268           -0.131350   
Fwd IAT Mean                       0.119108           -0.124701   
Fwd IAT Std                        0.606764           -0.037134   
Fwd IAT Max                        0.145319           -0.092624   
Fwd IAT Min                        0.152866            0.077115   
Bwd IAT Total                      0.539414           -0.045929   
Bwd IAT Mean                       0.119134           -0.069465   
Bwd IAT Std                        0.601234           -0.068592   
Bwd IAT Max                        0.637725           -0.070402   
Bwd IAT Min                        0.043402            0.246999   
Fwd Header Length                  0.488606            0.013206   
Bwd Header Length                  0.559730           -0.039221   
Fwd Packets/s                     -0.064800            0.188806   
Bwd Packets/s                     -0.046593            0.130409   
Packet Length Max                  0.601459            0.171932   
Packet Length Mean                 0.530830            0.183537   
Packet Length Std                  0.485171            0.201096   
Packet Length Variance             0.521250            0.236211   
Avg Packet Size                    0.484636            0.178445   
Avg Fwd Segment Size               0.433777            0.261404   
Avg Bwd Segment Size               0.556433            0.125144   
Subflow Fwd Packets                0.536628            0.028631   
Subflow Fwd Bytes                  0.470164            0.148935   
Subflow Bwd Packets                0.667482           -0.018963   
Subflow Bwd Bytes                  1.000000            0.058459   
Init Fwd Win Bytes                 0.058459            1.000000   
Init Bwd Win Bytes                 0.626598            0.152317   
Fwd Act Data Packets               0.495346           -0.036225   
Fwd Seg Size Min                   0.053893            0.032016   
attack_id                         -0.035668           -0.024570   

                          Init Bwd Win Bytes  Fwd Act Data Packets  \
Flow Duration                       0.220855              0.245885   
Total Fwd Packets                   0.541543              0.755914   
Total Backward Packets              0.501559              0.602988   
Fwd Packets Length Total            0.477821              0.559723   
Bwd Packets Length Total            0.626598              0.495346   
Fwd Packet Length Max               0.463152              0.397308   
Fwd Packet Length Mean              0.415983              0.323995   
Fwd Packet Length Std               0.390927              0.282448   
Bwd Packet Length Max               0.383273              0.585226   
Bwd Packet Length Mean              0.277818              0.415084   
Bwd Packet Length Std               0.315921              0.529967   
Flow Bytes/s                       -0.084479             -0.040516   
Flow Packets/s                     -0.079840             -0.139593   
Flow IAT Mean                       0.024429              0.145694   
Flow IAT Std                        0.022531              0.130074   
Flow IAT Max                       -0.014380              0.185398   
Flow IAT Min                       -0.040549             -0.130984   
Fwd IAT Total                       0.291423              0.296800   
Fwd IAT Mean                        0.104470              0.180328   
Fwd IAT Std                         0.699467              0.485163   
Fwd IAT Max                         0.029922              0.221819   
Fwd IAT Min                        -0.004198              0.112142   
Bwd IAT Total                       0.470894              0.332644   
Bwd IAT Mean                       -0.046640              0.291875   
Bwd IAT Std                         0.558057              0.408957   
Bwd IAT Max                         0.513751              0.377197   
Bwd IAT Min                         0.064453             -0.009818   
Fwd Header Length                   0.424312              0.667525   
Bwd Header Length                   0.380560              0.616584   
Fwd Packets/s                      -0.073528             -0.143159   
Bwd Packets/s                      -0.069479             -0.095185   
Packet Length Max                   0.393128              0.558498   
Packet Length Mean                  0.320431              0.481511   
Packet Length Std                   0.287666              0.470960   
Packet Length Variance              0.344496              0.378085   
Avg Packet Size                     0.274867              0.434184   
Avg Fwd Segment Size                0.415983              0.323995   
Avg Bwd Segment Size                0.277818              0.415084   
Subflow Fwd Packets                 0.541543              0.755914   
Subflow Fwd Bytes                   0.477821              0.559723   
Subflow Bwd Packets                 0.501559              0.602988   
Subflow Bwd Bytes                   0.626598              0.495346   
Init Fwd Win Bytes                  0.152317             -0.036225   
Init Bwd Win Bytes                  1.000000              0.467861   
Fwd Act Data Packets                0.467861              1.000000   
Fwd Seg Size Min                   -0.034003              0.049872   
attack_id                          -0.188538             -0.086071   

                          Fwd Seg Size Min  attack_id  
Flow Duration                     0.172877   0.026159  
Total Fwd Packets                 0.151001  -0.039465  
Total Backward Packets            0.124126  -0.064490  
Fwd Packets Length Total          0.200562  -0.085335  
Bwd Packets Length Total          0.053893  -0.035668  
Fwd Packet Length Max             0.267014  -0.114476  
Fwd Packet Length Mean            0.226251  -0.196100  
Fwd Packet Length Std             0.302648  -0.141036  
Bwd Packet Length Max             0.082755  -0.064293  
Bwd Packet Length Mean            0.017201  -0.085765  
Bwd Packet Length Std             0.127629  -0.039774  
Flow Bytes/s                     -0.078757  -0.014417  
Flow Packets/s                   -0.120632   0.040890  
Flow IAT Mean                     0.148807   0.016617  
Flow IAT Std                      0.246742   0.012121  
Flow IAT Max                      0.162000  -0.049590  
Flow IAT Min                     -0.309198  -0.098165  
Fwd IAT Total                     0.204756  -0.017367  
Fwd IAT Mean                      0.221853  -0.019111  
Fwd IAT Std                      -0.002423  -0.121112  
Fwd IAT Max                       0.178686  -0.085004  
Fwd IAT Min                       0.057986   0.108564  
Bwd IAT Total                     0.017300   0.046394  
Bwd IAT Mean                      0.057588  -0.032041  
Bwd IAT Std                       0.029104   0.013587  
Bwd IAT Max                       0.011774   0.098360  
Bwd IAT Min                       0.107613  -0.056871  
Fwd Header Length                 0.327892  -0.021366  
Bwd Header Length                 0.264664  -0.029029  
Fwd Packets/s                    -0.127855   0.037160  
Bwd Packets/s                    -0.072828  -0.080766  
Packet Length Max                 0.154635  -0.061398  
Packet Length Mean                0.099643  -0.098992  
Packet Length Std                 0.152382  -0.097046  
Packet Length Variance            0.172997  -0.023701  
Avg Packet Size                   0.058608  -0.103647  
Avg Fwd Segment Size              0.226251  -0.196100  
Avg Bwd Segment Size              0.017201  -0.085765  
Subflow Fwd Packets               0.151001  -0.039465  
Subflow Fwd Bytes                 0.200562  -0.085335  
Subflow Bwd Packets               0.124126  -0.064490  
Subflow Bwd Bytes                 0.053893  -0.035668  
Init Fwd Win Bytes                0.032016  -0.024570  
Init Bwd Win Bytes               -0.034003  -0.188538  
Fwd Act Data Packets              0.049872  -0.086071  
Fwd Seg Size Min                  1.000000   0.109920  
attack_id                         0.109920   1.000000  

[47 rows x 47 columns]
In [124]:
for i in range(len(corr_matrix)):
    print(f"Row {i}: {corr_matrix.iloc[i]}")
Row 0: Flow Duration               1.000000
Total Fwd Packets           0.344572
Total Backward Packets      0.284508
Fwd Packets Length Total    0.266345
Bwd Packets Length Total    0.227557
Fwd Packet Length Max       0.291309
Fwd Packet Length Mean      0.223883
Fwd Packet Length Std       0.258275
Bwd Packet Length Max       0.193896
Bwd Packet Length Mean      0.069250
Bwd Packet Length Std       0.208548
Flow Bytes/s               -0.168463
Flow Packets/s             -0.201210
Flow IAT Mean               0.527404
Flow IAT Std                0.565126
Flow IAT Max                0.530732
Flow IAT Min               -0.087898
Fwd IAT Total               0.780641
Fwd IAT Mean                0.518293
Fwd IAT Std                 0.262062
Fwd IAT Max                 0.359788
Fwd IAT Min                 0.042262
Bwd IAT Total               0.124080
Bwd IAT Mean                0.020241
Bwd IAT Std                 0.206601
Bwd IAT Max                 0.209195
Bwd IAT Min                 0.110107
Fwd Header Length           0.330307
Bwd Header Length           0.294374
Fwd Packets/s              -0.201649
Bwd Packets/s              -0.215691
Packet Length Max           0.244895
Packet Length Mean          0.160033
Packet Length Std           0.211278
Packet Length Variance      0.183152
Avg Packet Size             0.131517
Avg Fwd Segment Size        0.223883
Avg Bwd Segment Size        0.069250
Subflow Fwd Packets         0.344572
Subflow Fwd Bytes           0.266345
Subflow Bwd Packets         0.284508
Subflow Bwd Bytes           0.227557
Init Fwd Win Bytes         -0.136505
Init Bwd Win Bytes          0.220855
Fwd Act Data Packets        0.245885
Fwd Seg Size Min            0.172877
attack_id                   0.026159
Name: Flow Duration, dtype: float64
Row 1: Flow Duration               0.344572
Total Fwd Packets           1.000000
Total Backward Packets      0.693930
Fwd Packets Length Total    0.580110
Bwd Packets Length Total    0.536628
Fwd Packet Length Max       0.535619
Fwd Packet Length Mean      0.443021
Fwd Packet Length Std       0.460252
Bwd Packet Length Max       0.559189
Bwd Packet Length Mean      0.389019
Bwd Packet Length Std       0.526333
Flow Bytes/s               -0.056336
Flow Packets/s             -0.142290
Flow IAT Mean               0.155158
Flow IAT Std                0.229385
Flow IAT Max                0.198171
Flow IAT Min               -0.165506
Fwd IAT Total               0.396678
Fwd IAT Mean                0.244505
Fwd IAT Std                 0.526483
Fwd IAT Max                 0.225066
Fwd IAT Min                 0.121484
Bwd IAT Total               0.324204
Bwd IAT Mean                0.192164
Bwd IAT Std                 0.448569
Bwd IAT Max                 0.410429
Bwd IAT Min                 0.098966
Fwd Header Length           0.806061
Bwd Header Length           0.711120
Fwd Packets/s              -0.141910
Bwd Packets/s              -0.126428
Packet Length Max           0.591023
Packet Length Mean          0.499634
Packet Length Std           0.500338
Packet Length Variance      0.456418
Avg Packet Size             0.444920
Avg Fwd Segment Size        0.443021
Avg Bwd Segment Size        0.389019
Subflow Fwd Packets         1.000000
Subflow Fwd Bytes           0.580110
Subflow Bwd Packets         0.693930
Subflow Bwd Bytes           0.536628
Init Fwd Win Bytes          0.028631
Init Bwd Win Bytes          0.541543
Fwd Act Data Packets        0.755914
Fwd Seg Size Min            0.151001
attack_id                  -0.039465
Name: Total Fwd Packets, dtype: float64
Row 2: Flow Duration               0.284508
Total Fwd Packets           0.693930
Total Backward Packets      1.000000
Fwd Packets Length Total    0.457674
Bwd Packets Length Total    0.667482
Fwd Packet Length Max       0.431439
Fwd Packet Length Mean      0.454429
Fwd Packet Length Std       0.418142
Bwd Packet Length Max       0.626985
Bwd Packet Length Mean      0.530979
Bwd Packet Length Std       0.589303
Flow Bytes/s                0.053830
Flow Packets/s             -0.046278
Flow IAT Mean               0.098510
Flow IAT Std                0.192020
Flow IAT Max                0.129965
Flow IAT Min               -0.113239
Fwd IAT Total               0.272520
Fwd IAT Mean                0.145574
Fwd IAT Std                 0.513079
Fwd IAT Max                 0.106457
Fwd IAT Min                 0.181549
Bwd IAT Total               0.476214
Bwd IAT Mean                0.163481
Bwd IAT Std                 0.512708
Bwd IAT Max                 0.521269
Bwd IAT Min                 0.075412
Fwd Header Length           0.615261
Bwd Header Length           0.846017
Fwd Packets/s              -0.068510
Bwd Packets/s              -0.013203
Packet Length Max           0.596776
Packet Length Mean          0.570523
Packet Length Std           0.552656
Packet Length Variance      0.457982
Avg Packet Size             0.537342
Avg Fwd Segment Size        0.454429
Avg Bwd Segment Size        0.530979
Subflow Fwd Packets         0.693930
Subflow Fwd Bytes           0.457674
Subflow Bwd Packets         1.000000
Subflow Bwd Bytes           0.667482
Init Fwd Win Bytes         -0.018963
Init Bwd Win Bytes          0.501559
Fwd Act Data Packets        0.602988
Fwd Seg Size Min            0.124126
attack_id                  -0.064490
Name: Total Backward Packets, dtype: float64
Row 3: Flow Duration               0.266345
Total Fwd Packets           0.580110
Total Backward Packets      0.457674
Fwd Packets Length Total    1.000000
Bwd Packets Length Total    0.470164
Fwd Packet Length Max       0.796363
Fwd Packet Length Mean      0.673515
Fwd Packet Length Std       0.656346
Bwd Packet Length Max       0.547430
Bwd Packet Length Mean      0.397051
Bwd Packet Length Std       0.512406
Flow Bytes/s               -0.001175
Flow Packets/s             -0.068197
Flow IAT Mean               0.200397
Flow IAT Std                0.312305
Flow IAT Max                0.232288
Flow IAT Min               -0.146314
Fwd IAT Total               0.350136
Fwd IAT Mean                0.293066
Fwd IAT Std                 0.413410
Fwd IAT Max                 0.292654
Fwd IAT Min                 0.101196
Bwd IAT Total               0.259034
Bwd IAT Mean                0.183088
Bwd IAT Std                 0.339885
Bwd IAT Max                 0.281791
Bwd IAT Min                 0.159474
Fwd Header Length           0.570863
Bwd Header Length           0.489739
Fwd Packets/s              -0.072082
Bwd Packets/s              -0.022332
Packet Length Max           0.680678
Packet Length Mean          0.577650
Packet Length Std           0.554814
Packet Length Variance      0.514811
Avg Packet Size             0.535458
Avg Fwd Segment Size        0.673515
Avg Bwd Segment Size        0.397051
Subflow Fwd Packets         0.580110
Subflow Fwd Bytes           1.000000
Subflow Bwd Packets         0.457674
Subflow Bwd Bytes           0.470164
Init Fwd Win Bytes          0.148935
Init Bwd Win Bytes          0.477821
Fwd Act Data Packets        0.559723
Fwd Seg Size Min            0.200562
attack_id                  -0.085335
Name: Fwd Packets Length Total, dtype: float64
Row 4: Flow Duration               0.227557
Total Fwd Packets           0.536628
Total Backward Packets      0.667482
Fwd Packets Length Total    0.470164
Bwd Packets Length Total    1.000000
Fwd Packet Length Max       0.451904
Fwd Packet Length Mean      0.433777
Fwd Packet Length Std       0.401362
Bwd Packet Length Max       0.642817
Bwd Packet Length Mean      0.556433
Bwd Packet Length Std       0.560662
Flow Bytes/s               -0.043166
Flow Packets/s             -0.054143
Flow IAT Mean               0.038200
Flow IAT Std                0.142081
Flow IAT Max                0.117542
Flow IAT Min               -0.075916
Fwd IAT Total               0.261268
Fwd IAT Mean                0.119108
Fwd IAT Std                 0.606764
Fwd IAT Max                 0.145319
Fwd IAT Min                 0.152866
Bwd IAT Total               0.539414
Bwd IAT Mean                0.119134
Bwd IAT Std                 0.601234
Bwd IAT Max                 0.637725
Bwd IAT Min                 0.043402
Fwd Header Length           0.488606
Bwd Header Length           0.559730
Fwd Packets/s              -0.064800
Bwd Packets/s              -0.046593
Packet Length Max           0.601459
Packet Length Mean          0.530830
Packet Length Std           0.485171
Packet Length Variance      0.521250
Avg Packet Size             0.484636
Avg Fwd Segment Size        0.433777
Avg Bwd Segment Size        0.556433
Subflow Fwd Packets         0.536628
Subflow Fwd Bytes           0.470164
Subflow Bwd Packets         0.667482
Subflow Bwd Bytes           1.000000
Init Fwd Win Bytes          0.058459
Init Bwd Win Bytes          0.626598
Fwd Act Data Packets        0.495346
Fwd Seg Size Min            0.053893
attack_id                  -0.035668
Name: Bwd Packets Length Total, dtype: float64
Row 5: Flow Duration               0.291309
Total Fwd Packets           0.535619
Total Backward Packets      0.431439
Fwd Packets Length Total    0.796363
Bwd Packets Length Total    0.451904
Fwd Packet Length Max       1.000000
Fwd Packet Length Mean      0.876929
Fwd Packet Length Std       0.910410
Bwd Packet Length Max       0.459193
Bwd Packet Length Mean      0.387529
Bwd Packet Length Std       0.475939
Flow Bytes/s               -0.002276
Flow Packets/s             -0.017606
Flow IAT Mean               0.206706
Flow IAT Std                0.403239
Flow IAT Max                0.256882
Flow IAT Min               -0.159033
Fwd IAT Total               0.383010
Fwd IAT Mean                0.364603
Fwd IAT Std                 0.324852
Fwd IAT Max                 0.323373
Fwd IAT Min                 0.067656
Bwd IAT Total               0.198299
Bwd IAT Mean                0.113974
Bwd IAT Std                 0.289248
Bwd IAT Max                 0.215992
Bwd IAT Min                 0.317312
Fwd Header Length           0.580049
Bwd Header Length           0.468355
Fwd Packets/s              -0.024095
Bwd Packets/s              -0.034213
Packet Length Max           0.693306
Packet Length Mean          0.624118
Packet Length Std           0.605821
Packet Length Variance      0.610889
Avg Packet Size             0.592635
Avg Fwd Segment Size        0.876929
Avg Bwd Segment Size        0.387529
Subflow Fwd Packets         0.535619
Subflow Fwd Bytes           0.796363
Subflow Bwd Packets         0.431439
Subflow Bwd Bytes           0.451904
Init Fwd Win Bytes          0.250035
Init Bwd Win Bytes          0.463152
Fwd Act Data Packets        0.397308
Fwd Seg Size Min            0.267014
attack_id                  -0.114476
Name: Fwd Packet Length Max, dtype: float64
Row 6: Flow Duration               0.223883
Total Fwd Packets           0.443021
Total Backward Packets      0.454429
Fwd Packets Length Total    0.673515
Bwd Packets Length Total    0.433777
Fwd Packet Length Max       0.876929
Fwd Packet Length Mean      1.000000
Fwd Packet Length Std       0.906266
Bwd Packet Length Max       0.389350
Bwd Packet Length Mean      0.430169
Bwd Packet Length Std       0.406844
Flow Bytes/s                0.088291
Flow Packets/s              0.079698
Flow IAT Mean               0.132827
Flow IAT Std                0.350501
Flow IAT Max                0.116476
Flow IAT Min               -0.096717
Fwd IAT Total               0.321079
Fwd IAT Mean                0.314509
Fwd IAT Std                 0.285779
Fwd IAT Max                 0.178340
Fwd IAT Min                 0.065737
Bwd IAT Total               0.203509
Bwd IAT Mean                0.018704
Bwd IAT Std                 0.242220
Bwd IAT Max                 0.166386
Bwd IAT Min                 0.315777
Fwd Header Length           0.467705
Bwd Header Length           0.462312
Fwd Packets/s               0.065484
Bwd Packets/s               0.053065
Packet Length Max           0.611326
Packet Length Mean          0.666252
Packet Length Std           0.567894
Packet Length Variance      0.576243
Avg Packet Size             0.656824
Avg Fwd Segment Size        1.000000
Avg Bwd Segment Size        0.430169
Subflow Fwd Packets         0.443021
Subflow Fwd Bytes           0.673515
Subflow Bwd Packets         0.454429
Subflow Bwd Bytes           0.433777
Init Fwd Win Bytes          0.261404
Init Bwd Win Bytes          0.415983
Fwd Act Data Packets        0.323995
Fwd Seg Size Min            0.226251
attack_id                  -0.196100
Name: Fwd Packet Length Mean, dtype: float64
Row 7: Flow Duration               0.258275
Total Fwd Packets           0.460252
Total Backward Packets      0.418142
Fwd Packets Length Total    0.656346
Bwd Packets Length Total    0.401362
Fwd Packet Length Max       0.910410
Fwd Packet Length Mean      0.906266
Fwd Packet Length Std       1.000000
Bwd Packet Length Max       0.396018
Bwd Packet Length Mean      0.376194
Bwd Packet Length Std       0.451969
Flow Bytes/s                0.039993
Flow Packets/s              0.047994
Flow IAT Mean               0.174096
Flow IAT Std                0.379778
Flow IAT Max                0.219872
Flow IAT Min               -0.162102
Fwd IAT Total               0.333408
Fwd IAT Mean                0.330904
Fwd IAT Std                 0.229975
Fwd IAT Max                 0.270654
Fwd IAT Min                 0.093723
Bwd IAT Total               0.160607
Bwd IAT Mean                0.077178
Bwd IAT Std                 0.221190
Bwd IAT Max                 0.154433
Bwd IAT Min                 0.378219
Fwd Header Length           0.530178
Bwd Header Length           0.456745
Fwd Packets/s               0.035362
Bwd Packets/s              -0.014289
Packet Length Max           0.634520
Packet Length Mean          0.595388
Packet Length Std           0.610741
Packet Length Variance      0.627071
Avg Packet Size             0.563593
Avg Fwd Segment Size        0.906266
Avg Bwd Segment Size        0.376194
Subflow Fwd Packets         0.460252
Subflow Fwd Bytes           0.656346
Subflow Bwd Packets         0.418142
Subflow Bwd Bytes           0.401362
Init Fwd Win Bytes          0.313570
Init Bwd Win Bytes          0.390927
Fwd Act Data Packets        0.282448
Fwd Seg Size Min            0.302648
attack_id                  -0.141036
Name: Fwd Packet Length Std, dtype: float64
Row 8: Flow Duration               0.193896
Total Fwd Packets           0.559189
Total Backward Packets      0.626985
Fwd Packets Length Total    0.547430
Bwd Packets Length Total    0.642817
Fwd Packet Length Max       0.459193
Fwd Packet Length Mean      0.389350
Fwd Packet Length Std       0.396018
Bwd Packet Length Max       1.000000
Bwd Packet Length Mean      0.745937
Bwd Packet Length Std       0.905455
Flow Bytes/s                0.027773
Flow Packets/s             -0.067909
Flow IAT Mean               0.122742
Flow IAT Std                0.130569
Flow IAT Max                0.291129
Flow IAT Min               -0.128903
Fwd IAT Total               0.192159
Fwd IAT Mean                0.110592
Fwd IAT Std                 0.400185
Fwd IAT Max                 0.303202
Fwd IAT Min                 0.239714
Bwd IAT Total               0.388287
Bwd IAT Mean                0.308447
Bwd IAT Std                 0.415092
Bwd IAT Max                 0.441702
Bwd IAT Min                 0.021272
Fwd Header Length           0.566809
Bwd Header Length           0.638425
Fwd Packets/s              -0.079164
Bwd Packets/s              -0.015688
Packet Length Max           0.916820
Packet Length Mean          0.736857
Packet Length Std           0.850145
Packet Length Variance      0.678489
Avg Packet Size             0.695708
Avg Fwd Segment Size        0.389350
Avg Bwd Segment Size        0.745937
Subflow Fwd Packets         0.559189
Subflow Fwd Bytes           0.547430
Subflow Bwd Packets         0.626985
Subflow Bwd Bytes           0.642817
Init Fwd Win Bytes          0.082989
Init Bwd Win Bytes          0.383273
Fwd Act Data Packets        0.585226
Fwd Seg Size Min            0.082755
attack_id                  -0.064293
Name: Bwd Packet Length Max, dtype: float64
Row 9: Flow Duration               0.069250
Total Fwd Packets           0.389019
Total Backward Packets      0.530979
Fwd Packets Length Total    0.397051
Bwd Packets Length Total    0.556433
Fwd Packet Length Max       0.387529
Fwd Packet Length Mean      0.430169
Fwd Packet Length Std       0.376194
Bwd Packet Length Max       0.745937
Bwd Packet Length Mean      1.000000
Bwd Packet Length Std       0.713763
Flow Bytes/s                0.059257
Flow Packets/s              0.017235
Flow IAT Mean               0.037822
Flow IAT Std                0.079935
Flow IAT Max                0.132577
Flow IAT Min               -0.045374
Fwd IAT Total               0.070620
Fwd IAT Mean                0.029596
Fwd IAT Std                 0.257838
Fwd IAT Max                 0.145865
Fwd IAT Min                 0.143203
Bwd IAT Total               0.314509
Bwd IAT Mean                0.216200
Bwd IAT Std                 0.319511
Bwd IAT Max                 0.364072
Bwd IAT Min                 0.069716
Fwd Header Length           0.398443
Bwd Header Length           0.530512
Fwd Packets/s               0.011930
Bwd Packets/s               0.105039
Packet Length Max           0.702215
Packet Length Mean          0.808163
Packet Length Std           0.682757
Packet Length Variance      0.724085
Avg Packet Size             0.801836
Avg Fwd Segment Size        0.430169
Avg Bwd Segment Size        1.000000
Subflow Fwd Packets         0.389019
Subflow Fwd Bytes           0.397051
Subflow Bwd Packets         0.530979
Subflow Bwd Bytes           0.556433
Init Fwd Win Bytes          0.125144
Init Bwd Win Bytes          0.277818
Fwd Act Data Packets        0.415084
Fwd Seg Size Min            0.017201
attack_id                  -0.085765
Name: Bwd Packet Length Mean, dtype: float64
Row 10: Flow Duration               0.208548
Total Fwd Packets           0.526333
Total Backward Packets      0.589303
Fwd Packets Length Total    0.512406
Bwd Packets Length Total    0.560662
Fwd Packet Length Max       0.475939
Fwd Packet Length Mean      0.406844
Fwd Packet Length Std       0.451969
Bwd Packet Length Max       0.905455
Bwd Packet Length Mean      0.713763
Bwd Packet Length Std       1.000000
Flow Bytes/s                0.031455
Flow Packets/s             -0.046486
Flow IAT Mean               0.165044
Flow IAT Std                0.185775
Flow IAT Max                0.307785
Flow IAT Min               -0.161815
Fwd IAT Total               0.189628
Fwd IAT Mean                0.144140
Fwd IAT Std                 0.315058
Fwd IAT Max                 0.304261
Fwd IAT Min                 0.230290
Bwd IAT Total               0.356071
Bwd IAT Mean                0.264339
Bwd IAT Std                 0.361693
Bwd IAT Max                 0.404445
Bwd IAT Min                 0.080120
Fwd Header Length           0.534625
Bwd Header Length           0.606914
Fwd Packets/s              -0.060242
Bwd Packets/s              -0.021202
Packet Length Max           0.866421
Packet Length Mean          0.730641
Packet Length Std           0.886948
Packet Length Variance      0.685478
Avg Packet Size             0.701522
Avg Fwd Segment Size        0.406844
Avg Bwd Segment Size        0.713763
Subflow Fwd Packets         0.526333
Subflow Fwd Bytes           0.512406
Subflow Bwd Packets         0.589303
Subflow Bwd Bytes           0.560662
Init Fwd Win Bytes          0.169575
Init Bwd Win Bytes          0.315921
Fwd Act Data Packets        0.529967
Fwd Seg Size Min            0.127629
attack_id                  -0.039774
Name: Bwd Packet Length Std, dtype: float64
Row 11: Flow Duration              -0.168463
Total Fwd Packets          -0.056336
Total Backward Packets      0.053830
Fwd Packets Length Total   -0.001175
Bwd Packets Length Total   -0.043166
Fwd Packet Length Max      -0.002276
Fwd Packet Length Mean      0.088291
Fwd Packet Length Std       0.039993
Bwd Packet Length Max       0.027773
Bwd Packet Length Mean      0.059257
Bwd Packet Length Std       0.031455
Flow Bytes/s                1.000000
Flow Packets/s              0.434222
Flow IAT Mean              -0.178772
Flow IAT Std               -0.133443
Flow IAT Max               -0.175313
Flow IAT Min               -0.041821
Fwd IAT Total              -0.142582
Fwd IAT Mean               -0.143920
Fwd IAT Std                -0.069871
Fwd IAT Max                -0.155532
Fwd IAT Min                 0.036841
Bwd IAT Total              -0.065239
Bwd IAT Mean               -0.143259
Bwd IAT Std                -0.055435
Bwd IAT Max                -0.083577
Bwd IAT Min                -0.052498
Fwd Header Length          -0.082406
Bwd Header Length           0.010898
Fwd Packets/s               0.382884
Bwd Packets/s               0.383061
Packet Length Max           0.031106
Packet Length Mean          0.093525
Packet Length Std           0.062113
Packet Length Variance     -0.019063
Avg Packet Size             0.107747
Avg Fwd Segment Size        0.088291
Avg Bwd Segment Size        0.059257
Subflow Fwd Packets        -0.056336
Subflow Fwd Bytes          -0.001175
Subflow Bwd Packets         0.053830
Subflow Bwd Bytes          -0.043166
Init Fwd Win Bytes          0.062303
Init Bwd Win Bytes         -0.084479
Fwd Act Data Packets       -0.040516
Fwd Seg Size Min           -0.078757
attack_id                  -0.014417
Name: Flow Bytes/s, dtype: float64
Row 12: Flow Duration              -0.201210
Total Fwd Packets          -0.142290
Total Backward Packets     -0.046278
Fwd Packets Length Total   -0.068197
Bwd Packets Length Total   -0.054143
Fwd Packet Length Max      -0.017606
Fwd Packet Length Mean      0.079698
Fwd Packet Length Std       0.047994
Bwd Packet Length Max      -0.067909
Bwd Packet Length Mean      0.017235
Bwd Packet Length Std      -0.046486
Flow Bytes/s                0.434222
Flow Packets/s              1.000000
Flow IAT Mean              -0.194094
Flow IAT Std               -0.159992
Flow IAT Max               -0.195863
Flow IAT Min               -0.060208
Fwd IAT Total              -0.169590
Fwd IAT Mean               -0.160047
Fwd IAT Std                -0.110826
Fwd IAT Max                -0.171152
Fwd IAT Min                 0.025983
Bwd IAT Total              -0.117358
Bwd IAT Mean               -0.147376
Bwd IAT Std                -0.115618
Bwd IAT Max                -0.133601
Bwd IAT Min                -0.069277
Fwd Header Length          -0.153481
Bwd Header Length          -0.089666
Fwd Packets/s               0.835874
Bwd Packets/s               0.258343
Packet Length Max          -0.048260
Packet Length Mean          0.015124
Packet Length Std          -0.015276
Packet Length Variance     -0.014598
Avg Packet Size             0.038576
Avg Fwd Segment Size        0.079698
Avg Bwd Segment Size        0.017235
Subflow Fwd Packets        -0.142290
Subflow Fwd Bytes          -0.068197
Subflow Bwd Packets        -0.046278
Subflow Bwd Bytes          -0.054143
Init Fwd Win Bytes          0.192189
Init Bwd Win Bytes         -0.079840
Fwd Act Data Packets       -0.139593
Fwd Seg Size Min           -0.120632
attack_id                   0.040890
Name: Flow Packets/s, dtype: float64
Row 13: Flow Duration               0.527404
Total Fwd Packets           0.155158
Total Backward Packets      0.098510
Fwd Packets Length Total    0.200397
Bwd Packets Length Total    0.038200
Fwd Packet Length Max       0.206706
Fwd Packet Length Mean      0.132827
Fwd Packet Length Std       0.174096
Bwd Packet Length Max       0.122742
Bwd Packet Length Mean      0.037822
Bwd Packet Length Std       0.165044
Flow Bytes/s               -0.178772
Flow Packets/s             -0.194094
Flow IAT Mean               1.000000
Flow IAT Std                0.488745
Flow IAT Max                0.480091
Flow IAT Min               -0.092538
Fwd IAT Total               0.448291
Fwd IAT Mean                0.623333
Fwd IAT Std                 0.024000
Fwd IAT Max                 0.401846
Fwd IAT Min                 0.032743
Bwd IAT Total               0.011753
Bwd IAT Mean                0.117046
Bwd IAT Std                 0.014825
Bwd IAT Max                 0.035911
Bwd IAT Min                 0.095276
Fwd Header Length           0.175209
Bwd Header Length           0.155082
Fwd Packets/s              -0.193202
Bwd Packets/s              -0.213609
Packet Length Max           0.185374
Packet Length Mean          0.113642
Packet Length Std           0.190332
Packet Length Variance      0.115945
Avg Packet Size             0.099798
Avg Fwd Segment Size        0.132827
Avg Bwd Segment Size        0.037822
Subflow Fwd Packets         0.155158
Subflow Fwd Bytes           0.200397
Subflow Bwd Packets         0.098510
Subflow Bwd Bytes           0.038200
Init Fwd Win Bytes         -0.113184
Init Bwd Win Bytes          0.024429
Fwd Act Data Packets        0.145694
Fwd Seg Size Min            0.148807
attack_id                   0.016617
Name: Flow IAT Mean, dtype: float64
Row 14: Flow Duration               0.565126
Total Fwd Packets           0.229385
Total Backward Packets      0.192020
Fwd Packets Length Total    0.312305
Bwd Packets Length Total    0.142081
Fwd Packet Length Max       0.403239
Fwd Packet Length Mean      0.350501
Fwd Packet Length Std       0.379778
Bwd Packet Length Max       0.130569
Bwd Packet Length Mean      0.079935
Bwd Packet Length Std       0.185775
Flow Bytes/s               -0.133443
Flow Packets/s             -0.159992
Flow IAT Mean               0.488745
Flow IAT Std                1.000000
Flow IAT Max                0.344938
Flow IAT Min               -0.093810
Fwd IAT Total               0.554799
Fwd IAT Mean                0.592206
Fwd IAT Std                 0.082205
Fwd IAT Max                 0.309412
Fwd IAT Min                -0.014954
Bwd IAT Total               0.123286
Bwd IAT Mean               -0.064555
Bwd IAT Std                 0.127745
Bwd IAT Max                 0.167128
Bwd IAT Min                 0.168080
Fwd Header Length           0.267877
Bwd Header Length           0.241152
Fwd Packets/s              -0.159956
Bwd Packets/s              -0.171509
Packet Length Max           0.286885
Packet Length Mean          0.250089
Packet Length Std           0.261937
Packet Length Variance      0.259776
Avg Packet Size             0.238214
Avg Fwd Segment Size        0.350501
Avg Bwd Segment Size        0.079935
Subflow Fwd Packets         0.229385
Subflow Fwd Bytes           0.312305
Subflow Bwd Packets         0.192020
Subflow Bwd Bytes           0.142081
Init Fwd Win Bytes         -0.153843
Init Bwd Win Bytes          0.022531
Fwd Act Data Packets        0.130074
Fwd Seg Size Min            0.246742
attack_id                   0.012121
Name: Flow IAT Std, dtype: float64
Row 15: Flow Duration               0.530732
Total Fwd Packets           0.198171
Total Backward Packets      0.129965
Fwd Packets Length Total    0.232288
Bwd Packets Length Total    0.117542
Fwd Packet Length Max       0.256882
Fwd Packet Length Mean      0.116476
Fwd Packet Length Std       0.219872
Bwd Packet Length Max       0.291129
Bwd Packet Length Mean      0.132577
Bwd Packet Length Std       0.307785
Flow Bytes/s               -0.175313
Flow Packets/s             -0.195863
Flow IAT Mean               0.480091
Flow IAT Std                0.344938
Flow IAT Max                1.000000
Flow IAT Min               -0.089186
Fwd IAT Total               0.418495
Fwd IAT Mean                0.376253
Fwd IAT Std                -0.023027
Fwd IAT Max                 0.889802
Fwd IAT Min                 0.120750
Bwd IAT Total              -0.048240
Bwd IAT Mean                0.500898
Bwd IAT Std                -0.027571
Bwd IAT Max                -0.035833
Bwd IAT Min                 0.136607
Fwd Header Length           0.316224
Bwd Header Length           0.231894
Fwd Packets/s              -0.196603
Bwd Packets/s              -0.220567
Packet Length Max           0.313121
Packet Length Mean          0.168161
Packet Length Std           0.305920
Packet Length Variance      0.227996
Avg Packet Size             0.138955
Avg Fwd Segment Size        0.116476
Avg Bwd Segment Size        0.132577
Subflow Fwd Packets         0.198171
Subflow Fwd Bytes           0.232288
Subflow Bwd Packets         0.129965
Subflow Bwd Bytes           0.117542
Init Fwd Win Bytes         -0.107938
Init Bwd Win Bytes         -0.014380
Fwd Act Data Packets        0.185398
Fwd Seg Size Min            0.162000
attack_id                  -0.049590
Name: Flow IAT Max, dtype: float64
Row 16: Flow Duration              -0.087898
Total Fwd Packets          -0.165506
Total Backward Packets     -0.113239
Fwd Packets Length Total   -0.146314
Bwd Packets Length Total   -0.075916
Fwd Packet Length Max      -0.159033
Fwd Packet Length Mean     -0.096717
Fwd Packet Length Std      -0.162102
Bwd Packet Length Max      -0.128903
Bwd Packet Length Mean     -0.045374
Bwd Packet Length Std      -0.161815
Flow Bytes/s               -0.041821
Flow Packets/s             -0.060208
Flow IAT Mean              -0.092538
Flow IAT Std               -0.093810
Flow IAT Max               -0.089186
Flow IAT Min                1.000000
Fwd IAT Total              -0.071933
Fwd IAT Mean               -0.072768
Fwd IAT Std                -0.031436
Fwd IAT Max                -0.081323
Fwd IAT Min                 0.038207
Bwd IAT Total              -0.053105
Bwd IAT Mean               -0.054524
Bwd IAT Std                -0.055326
Bwd IAT Max                -0.056652
Bwd IAT Min                -0.050071
Fwd Header Length          -0.202540
Bwd Header Length          -0.169858
Fwd Packets/s              -0.060445
Bwd Packets/s              -0.024535
Packet Length Max          -0.170442
Packet Length Mean         -0.099794
Packet Length Std          -0.160214
Packet Length Variance     -0.152197
Avg Packet Size            -0.068930
Avg Fwd Segment Size       -0.096717
Avg Bwd Segment Size       -0.045374
Subflow Fwd Packets        -0.165506
Subflow Fwd Bytes          -0.146314
Subflow Bwd Packets        -0.113239
Subflow Bwd Bytes          -0.075916
Init Fwd Win Bytes         -0.073047
Init Bwd Win Bytes         -0.040549
Fwd Act Data Packets       -0.130984
Fwd Seg Size Min           -0.309198
attack_id                  -0.098165
Name: Flow IAT Min, dtype: float64
Row 17: Flow Duration               0.780641
Total Fwd Packets           0.396678
Total Backward Packets      0.272520
Fwd Packets Length Total    0.350136
Bwd Packets Length Total    0.261268
Fwd Packet Length Max       0.383010
Fwd Packet Length Mean      0.321079
Fwd Packet Length Std       0.333408
Bwd Packet Length Max       0.192159
Bwd Packet Length Mean      0.070620
Bwd Packet Length Std       0.189628
Flow Bytes/s               -0.142582
Flow Packets/s             -0.169590
Flow IAT Mean               0.448291
Flow IAT Std                0.554799
Flow IAT Max                0.418495
Flow IAT Min               -0.071933
Fwd IAT Total               1.000000
Fwd IAT Mean                0.642652
Fwd IAT Std                 0.319268
Fwd IAT Max                 0.460709
Fwd IAT Min                 0.004437
Bwd IAT Total               0.133061
Bwd IAT Mean               -0.059166
Bwd IAT Std                 0.248535
Bwd IAT Max                 0.211641
Bwd IAT Min                 0.050383
Fwd Header Length           0.375993
Bwd Header Length           0.283238
Fwd Packets/s              -0.169231
Bwd Packets/s              -0.181755
Packet Length Max           0.270836
Packet Length Mean          0.206450
Packet Length Std           0.215716
Packet Length Variance      0.206249
Avg Packet Size             0.178638
Avg Fwd Segment Size        0.321079
Avg Bwd Segment Size        0.070620
Subflow Fwd Packets         0.396678
Subflow Fwd Bytes           0.350136
Subflow Bwd Packets         0.272520
Subflow Bwd Bytes           0.261268
Init Fwd Win Bytes         -0.131350
Init Bwd Win Bytes          0.291423
Fwd Act Data Packets        0.296800
Fwd Seg Size Min            0.204756
attack_id                  -0.017367
Name: Fwd IAT Total, dtype: float64
Row 18: Flow Duration               0.518293
Total Fwd Packets           0.244505
Total Backward Packets      0.145574
Fwd Packets Length Total    0.293066
Bwd Packets Length Total    0.119108
Fwd Packet Length Max       0.364603
Fwd Packet Length Mean      0.314509
Fwd Packet Length Std       0.330904
Bwd Packet Length Max       0.110592
Bwd Packet Length Mean      0.029596
Bwd Packet Length Std       0.144140
Flow Bytes/s               -0.143920
Flow Packets/s             -0.160047
Flow IAT Mean               0.623333
Flow IAT Std                0.592206
Flow IAT Max                0.376253
Flow IAT Min               -0.072768
Fwd IAT Total               0.642652
Fwd IAT Mean                1.000000
Fwd IAT Std                 0.112909
Fwd IAT Max                 0.432296
Fwd IAT Min                -0.009545
Bwd IAT Total               0.059535
Bwd IAT Mean               -0.027191
Bwd IAT Std                 0.095992
Bwd IAT Max                 0.073118
Bwd IAT Min                 0.079706
Fwd Header Length           0.260631
Bwd Header Length           0.198780
Fwd Packets/s              -0.158858
Bwd Packets/s              -0.176949
Packet Length Max           0.224164
Packet Length Mean          0.176049
Packet Length Std           0.208747
Packet Length Variance      0.175577
Avg Packet Size             0.158530
Avg Fwd Segment Size        0.314509
Avg Bwd Segment Size        0.029596
Subflow Fwd Packets         0.244505
Subflow Fwd Bytes           0.293066
Subflow Bwd Packets         0.145574
Subflow Bwd Bytes           0.119108
Init Fwd Win Bytes         -0.124701
Init Bwd Win Bytes          0.104470
Fwd Act Data Packets        0.180328
Fwd Seg Size Min            0.221853
attack_id                  -0.019111
Name: Fwd IAT Mean, dtype: float64
Row 19: Flow Duration               0.262062
Total Fwd Packets           0.526483
Total Backward Packets      0.513079
Fwd Packets Length Total    0.413410
Bwd Packets Length Total    0.606764
Fwd Packet Length Max       0.324852
Fwd Packet Length Mean      0.285779
Fwd Packet Length Std       0.229975
Bwd Packet Length Max       0.400185
Bwd Packet Length Mean      0.257838
Bwd Packet Length Std       0.315058
Flow Bytes/s               -0.069871
Flow Packets/s             -0.110826
Flow IAT Mean               0.024000
Flow IAT Std                0.082205
Flow IAT Max               -0.023027
Flow IAT Min               -0.031436
Fwd IAT Total               0.319268
Fwd IAT Mean                0.112909
Fwd IAT Std                 1.000000
Fwd IAT Max                 0.005651
Fwd IAT Min                 0.066522
Bwd IAT Total               0.502530
Bwd IAT Mean               -0.050013
Bwd IAT Std                 0.594384
Bwd IAT Max                 0.583120
Bwd IAT Min                -0.062677
Fwd Header Length           0.402424
Bwd Header Length           0.395598
Fwd Packets/s              -0.110018
Bwd Packets/s              -0.098129
Packet Length Max           0.345060
Packet Length Mean          0.272211
Packet Length Std           0.239862
Packet Length Variance      0.250222
Avg Packet Size             0.228855
Avg Fwd Segment Size        0.285779
Avg Bwd Segment Size        0.257838
Subflow Fwd Packets         0.526483
Subflow Fwd Bytes           0.413410
Subflow Bwd Packets         0.513079
Subflow Bwd Bytes           0.606764
Init Fwd Win Bytes         -0.037134
Init Bwd Win Bytes          0.699467
Fwd Act Data Packets        0.485163
Fwd Seg Size Min           -0.002423
attack_id                  -0.121112
Name: Fwd IAT Std, dtype: float64
Row 20: Flow Duration               0.359788
Total Fwd Packets           0.225066
Total Backward Packets      0.106457
Fwd Packets Length Total    0.292654
Bwd Packets Length Total    0.145319
Fwd Packet Length Max       0.323373
Fwd Packet Length Mean      0.178340
Fwd Packet Length Std       0.270654
Bwd Packet Length Max       0.303202
Bwd Packet Length Mean      0.145865
Bwd Packet Length Std       0.304261
Flow Bytes/s               -0.155532
Flow Packets/s             -0.171152
Flow IAT Mean               0.401846
Flow IAT Std                0.309412
Flow IAT Max                0.889802
Flow IAT Min               -0.081323
Fwd IAT Total               0.460709
Fwd IAT Mean                0.432296
Fwd IAT Std                 0.005651
Fwd IAT Max                 1.000000
Fwd IAT Min                 0.101149
Bwd IAT Total              -0.045330
Bwd IAT Mean                0.486238
Bwd IAT Std                -0.004700
Bwd IAT Max                -0.043031
Bwd IAT Min                 0.095182
Fwd Header Length           0.352172
Bwd Header Length           0.215404
Fwd Packets/s              -0.171523
Bwd Packets/s              -0.194523
Packet Length Max           0.340404
Packet Length Mean          0.205559
Packet Length Std           0.315372
Packet Length Variance      0.253050
Avg Packet Size             0.175532
Avg Fwd Segment Size        0.178340
Avg Bwd Segment Size        0.145865
Subflow Fwd Packets         0.225066
Subflow Fwd Bytes           0.292654
Subflow Bwd Packets         0.106457
Subflow Bwd Bytes           0.145319
Init Fwd Win Bytes         -0.092624
Init Bwd Win Bytes          0.029922
Fwd Act Data Packets        0.221819
Fwd Seg Size Min            0.178686
attack_id                  -0.085004
Name: Fwd IAT Max, dtype: float64
Row 21: Flow Duration               0.042262
Total Fwd Packets           0.121484
Total Backward Packets      0.181549
Fwd Packets Length Total    0.101196
Bwd Packets Length Total    0.152866
Fwd Packet Length Max       0.067656
Fwd Packet Length Mean      0.065737
Fwd Packet Length Std       0.093723
Bwd Packet Length Max       0.239714
Bwd Packet Length Mean      0.143203
Bwd Packet Length Std       0.230290
Flow Bytes/s                0.036841
Flow Packets/s              0.025983
Flow IAT Mean               0.032743
Flow IAT Std               -0.014954
Flow IAT Max                0.120750
Flow IAT Min                0.038207
Fwd IAT Total               0.004437
Fwd IAT Mean               -0.009545
Fwd IAT Std                 0.066522
Fwd IAT Max                 0.101149
Fwd IAT Min                 1.000000
Bwd IAT Total               0.056121
Bwd IAT Mean                0.153464
Bwd IAT Std                 0.025600
Bwd IAT Max                 0.039186
Bwd IAT Min                 0.007819
Fwd Header Length           0.159111
Bwd Header Length           0.217293
Fwd Packets/s               0.014289
Bwd Packets/s              -0.013465
Packet Length Max           0.192029
Packet Length Mean          0.118852
Packet Length Std           0.184351
Packet Length Variance      0.146953
Avg Packet Size             0.094682
Avg Fwd Segment Size        0.065737
Avg Bwd Segment Size        0.143203
Subflow Fwd Packets         0.121484
Subflow Fwd Bytes           0.101196
Subflow Bwd Packets         0.181549
Subflow Bwd Bytes           0.152866
Init Fwd Win Bytes          0.077115
Init Bwd Win Bytes         -0.004198
Fwd Act Data Packets        0.112142
Fwd Seg Size Min            0.057986
attack_id                   0.108564
Name: Fwd IAT Min, dtype: float64
Row 22: Flow Duration               0.124080
Total Fwd Packets           0.324204
Total Backward Packets      0.476214
Fwd Packets Length Total    0.259034
Bwd Packets Length Total    0.539414
Fwd Packet Length Max       0.198299
Fwd Packet Length Mean      0.203509
Fwd Packet Length Std       0.160607
Bwd Packet Length Max       0.388287
Bwd Packet Length Mean      0.314509
Bwd Packet Length Std       0.356071
Flow Bytes/s               -0.065239
Flow Packets/s             -0.117358
Flow IAT Mean               0.011753
Flow IAT Std                0.123286
Flow IAT Max               -0.048240
Flow IAT Min               -0.053105
Fwd IAT Total               0.133061
Fwd IAT Mean                0.059535
Fwd IAT Std                 0.502530
Fwd IAT Max                -0.045330
Fwd IAT Min                 0.056121
Bwd IAT Total               1.000000
Bwd IAT Mean               -0.070098
Bwd IAT Std                 0.587980
Bwd IAT Max                 0.745204
Bwd IAT Min                -0.031971
Fwd Header Length           0.248800
Bwd Header Length           0.367119
Fwd Packets/s              -0.119275
Bwd Packets/s              -0.090444
Packet Length Max           0.340607
Packet Length Mean          0.296223
Packet Length Std           0.284634
Packet Length Variance      0.320536
Avg Packet Size             0.266127
Avg Fwd Segment Size        0.203509
Avg Bwd Segment Size        0.314509
Subflow Fwd Packets         0.324204
Subflow Fwd Bytes           0.259034
Subflow Bwd Packets         0.476214
Subflow Bwd Bytes           0.539414
Init Fwd Win Bytes         -0.045929
Init Bwd Win Bytes          0.470894
Fwd Act Data Packets        0.332644
Fwd Seg Size Min            0.017300
attack_id                   0.046394
Name: Bwd IAT Total, dtype: float64
Row 23: Flow Duration               0.020241
Total Fwd Packets           0.192164
Total Backward Packets      0.163481
Fwd Packets Length Total    0.183088
Bwd Packets Length Total    0.119134
Fwd Packet Length Max       0.113974
Fwd Packet Length Mean      0.018704
Fwd Packet Length Std       0.077178
Bwd Packet Length Max       0.308447
Bwd Packet Length Mean      0.216200
Bwd Packet Length Std       0.264339
Flow Bytes/s               -0.143259
Flow Packets/s             -0.147376
Flow IAT Mean               0.117046
Flow IAT Std               -0.064555
Flow IAT Max                0.500898
Flow IAT Min               -0.054524
Fwd IAT Total              -0.059166
Fwd IAT Mean               -0.027191
Fwd IAT Std                -0.050013
Fwd IAT Max                 0.486238
Fwd IAT Min                 0.153464
Bwd IAT Total              -0.070098
Bwd IAT Mean                1.000000
Bwd IAT Std                -0.066464
Bwd IAT Max                -0.062794
Bwd IAT Min                 0.060752
Fwd Header Length           0.344856
Bwd Header Length           0.306564
Fwd Packets/s              -0.147670
Bwd Packets/s              -0.171037
Packet Length Max           0.267974
Packet Length Mean          0.169775
Packet Length Std           0.228484
Packet Length Variance      0.204808
Avg Packet Size             0.136195
Avg Fwd Segment Size        0.018704
Avg Bwd Segment Size        0.216200
Subflow Fwd Packets         0.192164
Subflow Fwd Bytes           0.183088
Subflow Bwd Packets         0.163481
Subflow Bwd Bytes           0.119134
Init Fwd Win Bytes         -0.069465
Init Bwd Win Bytes         -0.046640
Fwd Act Data Packets        0.291875
Fwd Seg Size Min            0.057588
attack_id                  -0.032041
Name: Bwd IAT Mean, dtype: float64
Row 24: Flow Duration               0.206601
Total Fwd Packets           0.448569
Total Backward Packets      0.512708
Fwd Packets Length Total    0.339885
Bwd Packets Length Total    0.601234
Fwd Packet Length Max       0.289248
Fwd Packet Length Mean      0.242220
Fwd Packet Length Std       0.221190
Bwd Packet Length Max       0.415092
Bwd Packet Length Mean      0.319511
Bwd Packet Length Std       0.361693
Flow Bytes/s               -0.055435
Flow Packets/s             -0.115618
Flow IAT Mean               0.014825
Flow IAT Std                0.127745
Flow IAT Max               -0.027571
Flow IAT Min               -0.055326
Fwd IAT Total               0.248535
Fwd IAT Mean                0.095992
Fwd IAT Std                 0.594384
Fwd IAT Max                -0.004700
Fwd IAT Min                 0.025600
Bwd IAT Total               0.587980
Bwd IAT Mean               -0.066464
Bwd IAT Std                 1.000000
Bwd IAT Max                 0.722582
Bwd IAT Min                -0.017976
Fwd Header Length           0.360276
Bwd Header Length           0.409684
Fwd Packets/s              -0.117894
Bwd Packets/s              -0.098808
Packet Length Max           0.382150
Packet Length Mean          0.309635
Packet Length Std           0.302866
Packet Length Variance      0.323665
Avg Packet Size             0.277124
Avg Fwd Segment Size        0.242220
Avg Bwd Segment Size        0.319511
Subflow Fwd Packets         0.448569
Subflow Fwd Bytes           0.339885
Subflow Bwd Packets         0.512708
Subflow Bwd Bytes           0.601234
Init Fwd Win Bytes         -0.068592
Init Bwd Win Bytes          0.558057
Fwd Act Data Packets        0.408957
Fwd Seg Size Min            0.029104
attack_id                   0.013587
Name: Bwd IAT Std, dtype: float64
Row 25: Flow Duration               0.209195
Total Fwd Packets           0.410429
Total Backward Packets      0.521269
Fwd Packets Length Total    0.281791
Bwd Packets Length Total    0.637725
Fwd Packet Length Max       0.215992
Fwd Packet Length Mean      0.166386
Fwd Packet Length Std       0.154433
Bwd Packet Length Max       0.441702
Bwd Packet Length Mean      0.364072
Bwd Packet Length Std       0.404445
Flow Bytes/s               -0.083577
Flow Packets/s             -0.133601
Flow IAT Mean               0.035911
Flow IAT Std                0.167128
Flow IAT Max               -0.035833
Flow IAT Min               -0.056652
Fwd IAT Total               0.211641
Fwd IAT Mean                0.073118
Fwd IAT Std                 0.583120
Fwd IAT Max                -0.043031
Fwd IAT Min                 0.039186
Bwd IAT Total               0.745204
Bwd IAT Mean               -0.062794
Bwd IAT Std                 0.722582
Bwd IAT Max                 1.000000
Bwd IAT Min                -0.033679
Fwd Header Length           0.321280
Bwd Header Length           0.406998
Fwd Packets/s              -0.135456
Bwd Packets/s              -0.116905
Packet Length Max           0.390923
Packet Length Mean          0.310833
Packet Length Std           0.318395
Packet Length Variance      0.365282
Avg Packet Size             0.279261
Avg Fwd Segment Size        0.166386
Avg Bwd Segment Size        0.364072
Subflow Fwd Packets         0.410429
Subflow Fwd Bytes           0.281791
Subflow Bwd Packets         0.521269
Subflow Bwd Bytes           0.637725
Init Fwd Win Bytes         -0.070402
Init Bwd Win Bytes          0.513751
Fwd Act Data Packets        0.377197
Fwd Seg Size Min            0.011774
attack_id                   0.098360
Name: Bwd IAT Max, dtype: float64
Row 26: Flow Duration               0.110107
Total Fwd Packets           0.098966
Total Backward Packets      0.075412
Fwd Packets Length Total    0.159474
Bwd Packets Length Total    0.043402
Fwd Packet Length Max       0.317312
Fwd Packet Length Mean      0.315777
Fwd Packet Length Std       0.378219
Bwd Packet Length Max       0.021272
Bwd Packet Length Mean      0.069716
Bwd Packet Length Std       0.080120
Flow Bytes/s               -0.052498
Flow Packets/s             -0.069277
Flow IAT Mean               0.095276
Flow IAT Std                0.168080
Flow IAT Max                0.136607
Flow IAT Min               -0.050071
Fwd IAT Total               0.050383
Fwd IAT Mean                0.079706
Fwd IAT Std                -0.062677
Fwd IAT Max                 0.095182
Fwd IAT Min                 0.007819
Bwd IAT Total              -0.031971
Bwd IAT Mean                0.060752
Bwd IAT Std                -0.017976
Bwd IAT Max                -0.033679
Bwd IAT Min                 1.000000
Fwd Header Length           0.161279
Bwd Header Length           0.094064
Fwd Packets/s              -0.072123
Bwd Packets/s              -0.068265
Packet Length Max           0.165264
Packet Length Mean          0.168417
Packet Length Std           0.189050
Packet Length Variance      0.198223
Avg Packet Size             0.163039
Avg Fwd Segment Size        0.315777
Avg Bwd Segment Size        0.069716
Subflow Fwd Packets         0.098966
Subflow Fwd Bytes           0.159474
Subflow Bwd Packets         0.075412
Subflow Bwd Bytes           0.043402
Init Fwd Win Bytes          0.246999
Init Bwd Win Bytes          0.064453
Fwd Act Data Packets       -0.009818
Fwd Seg Size Min            0.107613
attack_id                  -0.056871
Name: Bwd IAT Min, dtype: float64
Row 27: Flow Duration               0.330307
Total Fwd Packets           0.806061
Total Backward Packets      0.615261
Fwd Packets Length Total    0.570863
Bwd Packets Length Total    0.488606
Fwd Packet Length Max       0.580049
Fwd Packet Length Mean      0.467705
Fwd Packet Length Std       0.530178
Bwd Packet Length Max       0.566809
Bwd Packet Length Mean      0.398443
Bwd Packet Length Std       0.534625
Flow Bytes/s               -0.082406
Flow Packets/s             -0.153481
Flow IAT Mean               0.175209
Flow IAT Std                0.267877
Flow IAT Max                0.316224
Flow IAT Min               -0.202540
Fwd IAT Total               0.375993
Fwd IAT Mean                0.260631
Fwd IAT Std                 0.402424
Fwd IAT Max                 0.352172
Fwd IAT Min                 0.159111
Bwd IAT Total               0.248800
Bwd IAT Mean                0.344856
Bwd IAT Std                 0.360276
Bwd IAT Max                 0.321280
Bwd IAT Min                 0.161279
Fwd Header Length           1.000000
Bwd Header Length           0.724390
Fwd Packets/s              -0.155406
Bwd Packets/s              -0.148240
Packet Length Max           0.621847
Packet Length Mean          0.500767
Packet Length Std           0.539616
Packet Length Variance      0.500078
Avg Packet Size             0.443692
Avg Fwd Segment Size        0.467705
Avg Bwd Segment Size        0.398443
Subflow Fwd Packets         0.806061
Subflow Fwd Bytes           0.570863
Subflow Bwd Packets         0.615261
Subflow Bwd Bytes           0.488606
Init Fwd Win Bytes          0.013206
Init Bwd Win Bytes          0.424312
Fwd Act Data Packets        0.667525
Fwd Seg Size Min            0.327892
attack_id                  -0.021366
Name: Fwd Header Length, dtype: float64
Row 28: Flow Duration               0.294374
Total Fwd Packets           0.711120
Total Backward Packets      0.846017
Fwd Packets Length Total    0.489739
Bwd Packets Length Total    0.559730
Fwd Packet Length Max       0.468355
Fwd Packet Length Mean      0.462312
Fwd Packet Length Std       0.456745
Bwd Packet Length Max       0.638425
Bwd Packet Length Mean      0.530512
Bwd Packet Length Std       0.606914
Flow Bytes/s                0.010898
Flow Packets/s             -0.089666
Flow IAT Mean               0.155082
Flow IAT Std                0.241152
Flow IAT Max                0.231894
Flow IAT Min               -0.169858
Fwd IAT Total               0.283238
Fwd IAT Mean                0.198780
Fwd IAT Std                 0.395598
Fwd IAT Max                 0.215404
Fwd IAT Min                 0.217293
Bwd IAT Total               0.367119
Bwd IAT Mean                0.306564
Bwd IAT Std                 0.409684
Bwd IAT Max                 0.406998
Bwd IAT Min                 0.094064
Fwd Header Length           0.724390
Bwd Header Length           1.000000
Fwd Packets/s              -0.108921
Bwd Packets/s              -0.065203
Packet Length Max           0.630535
Packet Length Mean          0.579675
Packet Length Std           0.589688
Packet Length Variance      0.504483
Avg Packet Size             0.538469
Avg Fwd Segment Size        0.462312
Avg Bwd Segment Size        0.530512
Subflow Fwd Packets         0.711120
Subflow Fwd Bytes           0.489739
Subflow Bwd Packets         0.846017
Subflow Bwd Bytes           0.559730
Init Fwd Win Bytes         -0.039221
Init Bwd Win Bytes          0.380560
Fwd Act Data Packets        0.616584
Fwd Seg Size Min            0.264664
attack_id                  -0.029029
Name: Bwd Header Length, dtype: float64
Row 29: Flow Duration              -0.201649
Total Fwd Packets          -0.141910
Total Backward Packets     -0.068510
Fwd Packets Length Total   -0.072082
Bwd Packets Length Total   -0.064800
Fwd Packet Length Max      -0.024095
Fwd Packet Length Mean      0.065484
Fwd Packet Length Std       0.035362
Bwd Packet Length Max      -0.079164
Bwd Packet Length Mean      0.011930
Bwd Packet Length Std      -0.060242
Flow Bytes/s                0.382884
Flow Packets/s              0.835874
Flow IAT Mean              -0.193202
Flow IAT Std               -0.159956
Flow IAT Max               -0.196603
Flow IAT Min               -0.060445
Fwd IAT Total              -0.169231
Fwd IAT Mean               -0.158858
Fwd IAT Std                -0.110018
Fwd IAT Max                -0.171523
Fwd IAT Min                 0.014289
Bwd IAT Total              -0.119275
Bwd IAT Mean               -0.147670
Bwd IAT Std                -0.117894
Bwd IAT Max                -0.135456
Bwd IAT Min                -0.072123
Fwd Header Length          -0.155406
Bwd Header Length          -0.108921
Fwd Packets/s               1.000000
Bwd Packets/s               0.276758
Packet Length Max          -0.059588
Packet Length Mean          0.004687
Packet Length Std          -0.027922
Packet Length Variance     -0.020036
Avg Packet Size             0.029423
Avg Fwd Segment Size        0.065484
Avg Bwd Segment Size        0.011930
Subflow Fwd Packets        -0.141910
Subflow Fwd Bytes          -0.072082
Subflow Bwd Packets        -0.068510
Subflow Bwd Bytes          -0.064800
Init Fwd Win Bytes          0.188806
Init Bwd Win Bytes         -0.073528
Fwd Act Data Packets       -0.143159
Fwd Seg Size Min           -0.127855
attack_id                   0.037160
Name: Fwd Packets/s, dtype: float64
Row 30: Flow Duration              -0.215691
Total Fwd Packets          -0.126428
Total Backward Packets     -0.013203
Fwd Packets Length Total   -0.022332
Bwd Packets Length Total   -0.046593
Fwd Packet Length Max      -0.034213
Fwd Packet Length Mean      0.053065
Fwd Packet Length Std      -0.014289
Bwd Packet Length Max      -0.015688
Bwd Packet Length Mean      0.105039
Bwd Packet Length Std      -0.021202
Flow Bytes/s                0.383061
Flow Packets/s              0.258343
Flow IAT Mean              -0.213609
Flow IAT Std               -0.171509
Flow IAT Max               -0.220567
Flow IAT Min               -0.024535
Fwd IAT Total              -0.181755
Fwd IAT Mean               -0.176949
Fwd IAT Std                -0.098129
Fwd IAT Max                -0.194523
Fwd IAT Min                -0.013465
Bwd IAT Total              -0.090444
Bwd IAT Mean               -0.171037
Bwd IAT Std                -0.098808
Bwd IAT Max                -0.116905
Bwd IAT Min                -0.068265
Fwd Header Length          -0.148240
Bwd Header Length          -0.065203
Fwd Packets/s               0.276758
Bwd Packets/s               1.000000
Packet Length Max          -0.021899
Packet Length Mean          0.088190
Packet Length Std           0.009359
Packet Length Variance     -0.024543
Avg Packet Size             0.116140
Avg Fwd Segment Size        0.053065
Avg Bwd Segment Size        0.105039
Subflow Fwd Packets        -0.126428
Subflow Fwd Bytes          -0.022332
Subflow Bwd Packets        -0.013203
Subflow Bwd Bytes          -0.046593
Init Fwd Win Bytes          0.130409
Init Bwd Win Bytes         -0.069479
Fwd Act Data Packets       -0.095185
Fwd Seg Size Min           -0.072828
attack_id                  -0.080766
Name: Bwd Packets/s, dtype: float64
Row 31: Flow Duration               0.244895
Total Fwd Packets           0.591023
Total Backward Packets      0.596776
Fwd Packets Length Total    0.680678
Bwd Packets Length Total    0.601459
Fwd Packet Length Max       0.693306
Fwd Packet Length Mean      0.611326
Fwd Packet Length Std       0.634520
Bwd Packet Length Max       0.916820
Bwd Packet Length Mean      0.702215
Bwd Packet Length Std       0.866421
Flow Bytes/s                0.031106
Flow Packets/s             -0.048260
Flow IAT Mean               0.185374
Flow IAT Std                0.286885
Flow IAT Max                0.313121
Flow IAT Min               -0.170442
Fwd IAT Total               0.270836
Fwd IAT Mean                0.224164
Fwd IAT Std                 0.345060
Fwd IAT Max                 0.340404
Fwd IAT Min                 0.192029
Bwd IAT Total               0.340607
Bwd IAT Mean                0.267974
Bwd IAT Std                 0.382150
Bwd IAT Max                 0.390923
Bwd IAT Min                 0.165264
Fwd Header Length           0.621847
Bwd Header Length           0.630535
Fwd Packets/s              -0.059588
Bwd Packets/s              -0.021899
Packet Length Max           1.000000
Packet Length Mean          0.817534
Packet Length Std           0.914118
Packet Length Variance      0.770931
Avg Packet Size             0.778213
Avg Fwd Segment Size        0.611326
Avg Bwd Segment Size        0.702215
Subflow Fwd Packets         0.591023
Subflow Fwd Bytes           0.680678
Subflow Bwd Packets         0.596776
Subflow Bwd Bytes           0.601459
Init Fwd Win Bytes          0.171932
Init Bwd Win Bytes          0.393128
Fwd Act Data Packets        0.558498
Fwd Seg Size Min            0.154635
attack_id                  -0.061398
Name: Packet Length Max, dtype: float64
Row 32: Flow Duration               0.160033
Total Fwd Packets           0.499634
Total Backward Packets      0.570523
Fwd Packets Length Total    0.577650
Bwd Packets Length Total    0.530830
Fwd Packet Length Max       0.624118
Fwd Packet Length Mean      0.666252
Fwd Packet Length Std       0.595388
Bwd Packet Length Max       0.736857
Bwd Packet Length Mean      0.808163
Bwd Packet Length Std       0.730641
Flow Bytes/s                0.093525
Flow Packets/s              0.015124
Flow IAT Mean               0.113642
Flow IAT Std                0.250089
Flow IAT Max                0.168161
Flow IAT Min               -0.099794
Fwd IAT Total               0.206450
Fwd IAT Mean                0.176049
Fwd IAT Std                 0.272211
Fwd IAT Max                 0.205559
Fwd IAT Min                 0.118852
Bwd IAT Total               0.296223
Bwd IAT Mean                0.169775
Bwd IAT Std                 0.309635
Bwd IAT Max                 0.310833
Bwd IAT Min                 0.168417
Fwd Header Length           0.500767
Bwd Header Length           0.579675
Fwd Packets/s               0.004687
Bwd Packets/s               0.088190
Packet Length Max           0.817534
Packet Length Mean          1.000000
Packet Length Std           0.778409
Packet Length Variance      0.731589
Avg Packet Size             0.965009
Avg Fwd Segment Size        0.666252
Avg Bwd Segment Size        0.808163
Subflow Fwd Packets         0.499634
Subflow Fwd Bytes           0.577650
Subflow Bwd Packets         0.570523
Subflow Bwd Bytes           0.530830
Init Fwd Win Bytes          0.183537
Init Bwd Win Bytes          0.320431
Fwd Act Data Packets        0.481511
Fwd Seg Size Min            0.099643
attack_id                  -0.098992
Name: Packet Length Mean, dtype: float64
Row 33: Flow Duration               0.211278
Total Fwd Packets           0.500338
Total Backward Packets      0.552656
Fwd Packets Length Total    0.554814
Bwd Packets Length Total    0.485171
Fwd Packet Length Max       0.605821
Fwd Packet Length Mean      0.567894
Fwd Packet Length Std       0.610741
Bwd Packet Length Max       0.850145
Bwd Packet Length Mean      0.682757
Bwd Packet Length Std       0.886948
Flow Bytes/s                0.062113
Flow Packets/s             -0.015276
Flow IAT Mean               0.190332
Flow IAT Std                0.261937
Flow IAT Max                0.305920
Flow IAT Min               -0.160214
Fwd IAT Total               0.215716
Fwd IAT Mean                0.208747
Fwd IAT Std                 0.239862
Fwd IAT Max                 0.315372
Fwd IAT Min                 0.184351
Bwd IAT Total               0.284634
Bwd IAT Mean                0.228484
Bwd IAT Std                 0.302866
Bwd IAT Max                 0.318395
Bwd IAT Min                 0.189050
Fwd Header Length           0.539616
Bwd Header Length           0.589688
Fwd Packets/s              -0.027922
Bwd Packets/s               0.009359
Packet Length Max           0.914118
Packet Length Mean          0.778409
Packet Length Std           1.000000
Packet Length Variance      0.693396
Avg Packet Size             0.764796
Avg Fwd Segment Size        0.567894
Avg Bwd Segment Size        0.682757
Subflow Fwd Packets         0.500338
Subflow Fwd Bytes           0.554814
Subflow Bwd Packets         0.552656
Subflow Bwd Bytes           0.485171
Init Fwd Win Bytes          0.201096
Init Bwd Win Bytes          0.287666
Fwd Act Data Packets        0.470960
Fwd Seg Size Min            0.152382
attack_id                  -0.097046
Name: Packet Length Std, dtype: float64
Row 34: Flow Duration               0.183152
Total Fwd Packets           0.456418
Total Backward Packets      0.457982
Fwd Packets Length Total    0.514811
Bwd Packets Length Total    0.521250
Fwd Packet Length Max       0.610889
Fwd Packet Length Mean      0.576243
Fwd Packet Length Std       0.627071
Bwd Packet Length Max       0.678489
Bwd Packet Length Mean      0.724085
Bwd Packet Length Std       0.685478
Flow Bytes/s               -0.019063
Flow Packets/s             -0.014598
Flow IAT Mean               0.115945
Flow IAT Std                0.259776
Flow IAT Max                0.227996
Flow IAT Min               -0.152197
Fwd IAT Total               0.206249
Fwd IAT Mean                0.175577
Fwd IAT Std                 0.250222
Fwd IAT Max                 0.253050
Fwd IAT Min                 0.146953
Bwd IAT Total               0.320536
Bwd IAT Mean                0.204808
Bwd IAT Std                 0.323665
Bwd IAT Max                 0.365282
Bwd IAT Min                 0.198223
Fwd Header Length           0.500078
Bwd Header Length           0.504483
Fwd Packets/s              -0.020036
Bwd Packets/s              -0.024543
Packet Length Max           0.770931
Packet Length Mean          0.731589
Packet Length Std           0.693396
Packet Length Variance      1.000000
Avg Packet Size             0.678704
Avg Fwd Segment Size        0.576243
Avg Bwd Segment Size        0.724085
Subflow Fwd Packets         0.456418
Subflow Fwd Bytes           0.514811
Subflow Bwd Packets         0.457982
Subflow Bwd Bytes           0.521250
Init Fwd Win Bytes          0.236211
Init Bwd Win Bytes          0.344496
Fwd Act Data Packets        0.378085
Fwd Seg Size Min            0.172997
attack_id                  -0.023701
Name: Packet Length Variance, dtype: float64
Row 35: Flow Duration               0.131517
Total Fwd Packets           0.444920
Total Backward Packets      0.537342
Fwd Packets Length Total    0.535458
Bwd Packets Length Total    0.484636
Fwd Packet Length Max       0.592635
Fwd Packet Length Mean      0.656824
Fwd Packet Length Std       0.563593
Bwd Packet Length Max       0.695708
Bwd Packet Length Mean      0.801836
Bwd Packet Length Std       0.701522
Flow Bytes/s                0.107747
Flow Packets/s              0.038576
Flow IAT Mean               0.099798
Flow IAT Std                0.238214
Flow IAT Max                0.138955
Flow IAT Min               -0.068930
Fwd IAT Total               0.178638
Fwd IAT Mean                0.158530
Fwd IAT Std                 0.228855
Fwd IAT Max                 0.175532
Fwd IAT Min                 0.094682
Bwd IAT Total               0.266127
Bwd IAT Mean                0.136195
Bwd IAT Std                 0.277124
Bwd IAT Max                 0.279261
Bwd IAT Min                 0.163039
Fwd Header Length           0.443692
Bwd Header Length           0.538469
Fwd Packets/s               0.029423
Bwd Packets/s               0.116140
Packet Length Max           0.778213
Packet Length Mean          0.965009
Packet Length Std           0.764796
Packet Length Variance      0.678704
Avg Packet Size             1.000000
Avg Fwd Segment Size        0.656824
Avg Bwd Segment Size        0.801836
Subflow Fwd Packets         0.444920
Subflow Fwd Bytes           0.535458
Subflow Bwd Packets         0.537342
Subflow Bwd Bytes           0.484636
Init Fwd Win Bytes          0.178445
Init Bwd Win Bytes          0.274867
Fwd Act Data Packets        0.434184
Fwd Seg Size Min            0.058608
attack_id                  -0.103647
Name: Avg Packet Size, dtype: float64
Row 36: Flow Duration               0.223883
Total Fwd Packets           0.443021
Total Backward Packets      0.454429
Fwd Packets Length Total    0.673515
Bwd Packets Length Total    0.433777
Fwd Packet Length Max       0.876929
Fwd Packet Length Mean      1.000000
Fwd Packet Length Std       0.906266
Bwd Packet Length Max       0.389350
Bwd Packet Length Mean      0.430169
Bwd Packet Length Std       0.406844
Flow Bytes/s                0.088291
Flow Packets/s              0.079698
Flow IAT Mean               0.132827
Flow IAT Std                0.350501
Flow IAT Max                0.116476
Flow IAT Min               -0.096717
Fwd IAT Total               0.321079
Fwd IAT Mean                0.314509
Fwd IAT Std                 0.285779
Fwd IAT Max                 0.178340
Fwd IAT Min                 0.065737
Bwd IAT Total               0.203509
Bwd IAT Mean                0.018704
Bwd IAT Std                 0.242220
Bwd IAT Max                 0.166386
Bwd IAT Min                 0.315777
Fwd Header Length           0.467705
Bwd Header Length           0.462312
Fwd Packets/s               0.065484
Bwd Packets/s               0.053065
Packet Length Max           0.611326
Packet Length Mean          0.666252
Packet Length Std           0.567894
Packet Length Variance      0.576243
Avg Packet Size             0.656824
Avg Fwd Segment Size        1.000000
Avg Bwd Segment Size        0.430169
Subflow Fwd Packets         0.443021
Subflow Fwd Bytes           0.673515
Subflow Bwd Packets         0.454429
Subflow Bwd Bytes           0.433777
Init Fwd Win Bytes          0.261404
Init Bwd Win Bytes          0.415983
Fwd Act Data Packets        0.323995
Fwd Seg Size Min            0.226251
attack_id                  -0.196100
Name: Avg Fwd Segment Size, dtype: float64
Row 37: Flow Duration               0.069250
Total Fwd Packets           0.389019
Total Backward Packets      0.530979
Fwd Packets Length Total    0.397051
Bwd Packets Length Total    0.556433
Fwd Packet Length Max       0.387529
Fwd Packet Length Mean      0.430169
Fwd Packet Length Std       0.376194
Bwd Packet Length Max       0.745937
Bwd Packet Length Mean      1.000000
Bwd Packet Length Std       0.713763
Flow Bytes/s                0.059257
Flow Packets/s              0.017235
Flow IAT Mean               0.037822
Flow IAT Std                0.079935
Flow IAT Max                0.132577
Flow IAT Min               -0.045374
Fwd IAT Total               0.070620
Fwd IAT Mean                0.029596
Fwd IAT Std                 0.257838
Fwd IAT Max                 0.145865
Fwd IAT Min                 0.143203
Bwd IAT Total               0.314509
Bwd IAT Mean                0.216200
Bwd IAT Std                 0.319511
Bwd IAT Max                 0.364072
Bwd IAT Min                 0.069716
Fwd Header Length           0.398443
Bwd Header Length           0.530512
Fwd Packets/s               0.011930
Bwd Packets/s               0.105039
Packet Length Max           0.702215
Packet Length Mean          0.808163
Packet Length Std           0.682757
Packet Length Variance      0.724085
Avg Packet Size             0.801836
Avg Fwd Segment Size        0.430169
Avg Bwd Segment Size        1.000000
Subflow Fwd Packets         0.389019
Subflow Fwd Bytes           0.397051
Subflow Bwd Packets         0.530979
Subflow Bwd Bytes           0.556433
Init Fwd Win Bytes          0.125144
Init Bwd Win Bytes          0.277818
Fwd Act Data Packets        0.415084
Fwd Seg Size Min            0.017201
attack_id                  -0.085765
Name: Avg Bwd Segment Size, dtype: float64
Row 38: Flow Duration               0.344572
Total Fwd Packets           1.000000
Total Backward Packets      0.693930
Fwd Packets Length Total    0.580110
Bwd Packets Length Total    0.536628
Fwd Packet Length Max       0.535619
Fwd Packet Length Mean      0.443021
Fwd Packet Length Std       0.460252
Bwd Packet Length Max       0.559189
Bwd Packet Length Mean      0.389019
Bwd Packet Length Std       0.526333
Flow Bytes/s               -0.056336
Flow Packets/s             -0.142290
Flow IAT Mean               0.155158
Flow IAT Std                0.229385
Flow IAT Max                0.198171
Flow IAT Min               -0.165506
Fwd IAT Total               0.396678
Fwd IAT Mean                0.244505
Fwd IAT Std                 0.526483
Fwd IAT Max                 0.225066
Fwd IAT Min                 0.121484
Bwd IAT Total               0.324204
Bwd IAT Mean                0.192164
Bwd IAT Std                 0.448569
Bwd IAT Max                 0.410429
Bwd IAT Min                 0.098966
Fwd Header Length           0.806061
Bwd Header Length           0.711120
Fwd Packets/s              -0.141910
Bwd Packets/s              -0.126428
Packet Length Max           0.591023
Packet Length Mean          0.499634
Packet Length Std           0.500338
Packet Length Variance      0.456418
Avg Packet Size             0.444920
Avg Fwd Segment Size        0.443021
Avg Bwd Segment Size        0.389019
Subflow Fwd Packets         1.000000
Subflow Fwd Bytes           0.580110
Subflow Bwd Packets         0.693930
Subflow Bwd Bytes           0.536628
Init Fwd Win Bytes          0.028631
Init Bwd Win Bytes          0.541543
Fwd Act Data Packets        0.755914
Fwd Seg Size Min            0.151001
attack_id                  -0.039465
Name: Subflow Fwd Packets, dtype: float64
Row 39: Flow Duration               0.266345
Total Fwd Packets           0.580110
Total Backward Packets      0.457674
Fwd Packets Length Total    1.000000
Bwd Packets Length Total    0.470164
Fwd Packet Length Max       0.796363
Fwd Packet Length Mean      0.673515
Fwd Packet Length Std       0.656346
Bwd Packet Length Max       0.547430
Bwd Packet Length Mean      0.397051
Bwd Packet Length Std       0.512406
Flow Bytes/s               -0.001175
Flow Packets/s             -0.068197
Flow IAT Mean               0.200397
Flow IAT Std                0.312305
Flow IAT Max                0.232288
Flow IAT Min               -0.146314
Fwd IAT Total               0.350136
Fwd IAT Mean                0.293066
Fwd IAT Std                 0.413410
Fwd IAT Max                 0.292654
Fwd IAT Min                 0.101196
Bwd IAT Total               0.259034
Bwd IAT Mean                0.183088
Bwd IAT Std                 0.339885
Bwd IAT Max                 0.281791
Bwd IAT Min                 0.159474
Fwd Header Length           0.570863
Bwd Header Length           0.489739
Fwd Packets/s              -0.072082
Bwd Packets/s              -0.022332
Packet Length Max           0.680678
Packet Length Mean          0.577650
Packet Length Std           0.554814
Packet Length Variance      0.514811
Avg Packet Size             0.535458
Avg Fwd Segment Size        0.673515
Avg Bwd Segment Size        0.397051
Subflow Fwd Packets         0.580110
Subflow Fwd Bytes           1.000000
Subflow Bwd Packets         0.457674
Subflow Bwd Bytes           0.470164
Init Fwd Win Bytes          0.148935
Init Bwd Win Bytes          0.477821
Fwd Act Data Packets        0.559723
Fwd Seg Size Min            0.200562
attack_id                  -0.085335
Name: Subflow Fwd Bytes, dtype: float64
Row 40: Flow Duration               0.284508
Total Fwd Packets           0.693930
Total Backward Packets      1.000000
Fwd Packets Length Total    0.457674
Bwd Packets Length Total    0.667482
Fwd Packet Length Max       0.431439
Fwd Packet Length Mean      0.454429
Fwd Packet Length Std       0.418142
Bwd Packet Length Max       0.626985
Bwd Packet Length Mean      0.530979
Bwd Packet Length Std       0.589303
Flow Bytes/s                0.053830
Flow Packets/s             -0.046278
Flow IAT Mean               0.098510
Flow IAT Std                0.192020
Flow IAT Max                0.129965
Flow IAT Min               -0.113239
Fwd IAT Total               0.272520
Fwd IAT Mean                0.145574
Fwd IAT Std                 0.513079
Fwd IAT Max                 0.106457
Fwd IAT Min                 0.181549
Bwd IAT Total               0.476214
Bwd IAT Mean                0.163481
Bwd IAT Std                 0.512708
Bwd IAT Max                 0.521269
Bwd IAT Min                 0.075412
Fwd Header Length           0.615261
Bwd Header Length           0.846017
Fwd Packets/s              -0.068510
Bwd Packets/s              -0.013203
Packet Length Max           0.596776
Packet Length Mean          0.570523
Packet Length Std           0.552656
Packet Length Variance      0.457982
Avg Packet Size             0.537342
Avg Fwd Segment Size        0.454429
Avg Bwd Segment Size        0.530979
Subflow Fwd Packets         0.693930
Subflow Fwd Bytes           0.457674
Subflow Bwd Packets         1.000000
Subflow Bwd Bytes           0.667482
Init Fwd Win Bytes         -0.018963
Init Bwd Win Bytes          0.501559
Fwd Act Data Packets        0.602988
Fwd Seg Size Min            0.124126
attack_id                  -0.064490
Name: Subflow Bwd Packets, dtype: float64
Row 41: Flow Duration               0.227557
Total Fwd Packets           0.536628
Total Backward Packets      0.667482
Fwd Packets Length Total    0.470164
Bwd Packets Length Total    1.000000
Fwd Packet Length Max       0.451904
Fwd Packet Length Mean      0.433777
Fwd Packet Length Std       0.401362
Bwd Packet Length Max       0.642817
Bwd Packet Length Mean      0.556433
Bwd Packet Length Std       0.560662
Flow Bytes/s               -0.043166
Flow Packets/s             -0.054143
Flow IAT Mean               0.038200
Flow IAT Std                0.142081
Flow IAT Max                0.117542
Flow IAT Min               -0.075916
Fwd IAT Total               0.261268
Fwd IAT Mean                0.119108
Fwd IAT Std                 0.606764
Fwd IAT Max                 0.145319
Fwd IAT Min                 0.152866
Bwd IAT Total               0.539414
Bwd IAT Mean                0.119134
Bwd IAT Std                 0.601234
Bwd IAT Max                 0.637725
Bwd IAT Min                 0.043402
Fwd Header Length           0.488606
Bwd Header Length           0.559730
Fwd Packets/s              -0.064800
Bwd Packets/s              -0.046593
Packet Length Max           0.601459
Packet Length Mean          0.530830
Packet Length Std           0.485171
Packet Length Variance      0.521250
Avg Packet Size             0.484636
Avg Fwd Segment Size        0.433777
Avg Bwd Segment Size        0.556433
Subflow Fwd Packets         0.536628
Subflow Fwd Bytes           0.470164
Subflow Bwd Packets         0.667482
Subflow Bwd Bytes           1.000000
Init Fwd Win Bytes          0.058459
Init Bwd Win Bytes          0.626598
Fwd Act Data Packets        0.495346
Fwd Seg Size Min            0.053893
attack_id                  -0.035668
Name: Subflow Bwd Bytes, dtype: float64
Row 42: Flow Duration              -0.136505
Total Fwd Packets           0.028631
Total Backward Packets     -0.018963
Fwd Packets Length Total    0.148935
Bwd Packets Length Total    0.058459
Fwd Packet Length Max       0.250035
Fwd Packet Length Mean      0.261404
Fwd Packet Length Std       0.313570
Bwd Packet Length Max       0.082989
Bwd Packet Length Mean      0.125144
Bwd Packet Length Std       0.169575
Flow Bytes/s                0.062303
Flow Packets/s              0.192189
Flow IAT Mean              -0.113184
Flow IAT Std               -0.153843
Flow IAT Max               -0.107938
Flow IAT Min               -0.073047
Fwd IAT Total              -0.131350
Fwd IAT Mean               -0.124701
Fwd IAT Std                -0.037134
Fwd IAT Max                -0.092624
Fwd IAT Min                 0.077115
Bwd IAT Total              -0.045929
Bwd IAT Mean               -0.069465
Bwd IAT Std                -0.068592
Bwd IAT Max                -0.070402
Bwd IAT Min                 0.246999
Fwd Header Length           0.013206
Bwd Header Length          -0.039221
Fwd Packets/s               0.188806
Bwd Packets/s               0.130409
Packet Length Max           0.171932
Packet Length Mean          0.183537
Packet Length Std           0.201096
Packet Length Variance      0.236211
Avg Packet Size             0.178445
Avg Fwd Segment Size        0.261404
Avg Bwd Segment Size        0.125144
Subflow Fwd Packets         0.028631
Subflow Fwd Bytes           0.148935
Subflow Bwd Packets        -0.018963
Subflow Bwd Bytes           0.058459
Init Fwd Win Bytes          1.000000
Init Bwd Win Bytes          0.152317
Fwd Act Data Packets       -0.036225
Fwd Seg Size Min            0.032016
attack_id                  -0.024570
Name: Init Fwd Win Bytes, dtype: float64
Row 43: Flow Duration               0.220855
Total Fwd Packets           0.541543
Total Backward Packets      0.501559
Fwd Packets Length Total    0.477821
Bwd Packets Length Total    0.626598
Fwd Packet Length Max       0.463152
Fwd Packet Length Mean      0.415983
Fwd Packet Length Std       0.390927
Bwd Packet Length Max       0.383273
Bwd Packet Length Mean      0.277818
Bwd Packet Length Std       0.315921
Flow Bytes/s               -0.084479
Flow Packets/s             -0.079840
Flow IAT Mean               0.024429
Flow IAT Std                0.022531
Flow IAT Max               -0.014380
Flow IAT Min               -0.040549
Fwd IAT Total               0.291423
Fwd IAT Mean                0.104470
Fwd IAT Std                 0.699467
Fwd IAT Max                 0.029922
Fwd IAT Min                -0.004198
Bwd IAT Total               0.470894
Bwd IAT Mean               -0.046640
Bwd IAT Std                 0.558057
Bwd IAT Max                 0.513751
Bwd IAT Min                 0.064453
Fwd Header Length           0.424312
Bwd Header Length           0.380560
Fwd Packets/s              -0.073528
Bwd Packets/s              -0.069479
Packet Length Max           0.393128
Packet Length Mean          0.320431
Packet Length Std           0.287666
Packet Length Variance      0.344496
Avg Packet Size             0.274867
Avg Fwd Segment Size        0.415983
Avg Bwd Segment Size        0.277818
Subflow Fwd Packets         0.541543
Subflow Fwd Bytes           0.477821
Subflow Bwd Packets         0.501559
Subflow Bwd Bytes           0.626598
Init Fwd Win Bytes          0.152317
Init Bwd Win Bytes          1.000000
Fwd Act Data Packets        0.467861
Fwd Seg Size Min           -0.034003
attack_id                  -0.188538
Name: Init Bwd Win Bytes, dtype: float64
Row 44: Flow Duration               0.245885
Total Fwd Packets           0.755914
Total Backward Packets      0.602988
Fwd Packets Length Total    0.559723
Bwd Packets Length Total    0.495346
Fwd Packet Length Max       0.397308
Fwd Packet Length Mean      0.323995
Fwd Packet Length Std       0.282448
Bwd Packet Length Max       0.585226
Bwd Packet Length Mean      0.415084
Bwd Packet Length Std       0.529967
Flow Bytes/s               -0.040516
Flow Packets/s             -0.139593
Flow IAT Mean               0.145694
Flow IAT Std                0.130074
Flow IAT Max                0.185398
Flow IAT Min               -0.130984
Fwd IAT Total               0.296800
Fwd IAT Mean                0.180328
Fwd IAT Std                 0.485163
Fwd IAT Max                 0.221819
Fwd IAT Min                 0.112142
Bwd IAT Total               0.332644
Bwd IAT Mean                0.291875
Bwd IAT Std                 0.408957
Bwd IAT Max                 0.377197
Bwd IAT Min                -0.009818
Fwd Header Length           0.667525
Bwd Header Length           0.616584
Fwd Packets/s              -0.143159
Bwd Packets/s              -0.095185
Packet Length Max           0.558498
Packet Length Mean          0.481511
Packet Length Std           0.470960
Packet Length Variance      0.378085
Avg Packet Size             0.434184
Avg Fwd Segment Size        0.323995
Avg Bwd Segment Size        0.415084
Subflow Fwd Packets         0.755914
Subflow Fwd Bytes           0.559723
Subflow Bwd Packets         0.602988
Subflow Bwd Bytes           0.495346
Init Fwd Win Bytes         -0.036225
Init Bwd Win Bytes          0.467861
Fwd Act Data Packets        1.000000
Fwd Seg Size Min            0.049872
attack_id                  -0.086071
Name: Fwd Act Data Packets, dtype: float64
Row 45: Flow Duration               0.172877
Total Fwd Packets           0.151001
Total Backward Packets      0.124126
Fwd Packets Length Total    0.200562
Bwd Packets Length Total    0.053893
Fwd Packet Length Max       0.267014
Fwd Packet Length Mean      0.226251
Fwd Packet Length Std       0.302648
Bwd Packet Length Max       0.082755
Bwd Packet Length Mean      0.017201
Bwd Packet Length Std       0.127629
Flow Bytes/s               -0.078757
Flow Packets/s             -0.120632
Flow IAT Mean               0.148807
Flow IAT Std                0.246742
Flow IAT Max                0.162000
Flow IAT Min               -0.309198
Fwd IAT Total               0.204756
Fwd IAT Mean                0.221853
Fwd IAT Std                -0.002423
Fwd IAT Max                 0.178686
Fwd IAT Min                 0.057986
Bwd IAT Total               0.017300
Bwd IAT Mean                0.057588
Bwd IAT Std                 0.029104
Bwd IAT Max                 0.011774
Bwd IAT Min                 0.107613
Fwd Header Length           0.327892
Bwd Header Length           0.264664
Fwd Packets/s              -0.127855
Bwd Packets/s              -0.072828
Packet Length Max           0.154635
Packet Length Mean          0.099643
Packet Length Std           0.152382
Packet Length Variance      0.172997
Avg Packet Size             0.058608
Avg Fwd Segment Size        0.226251
Avg Bwd Segment Size        0.017201
Subflow Fwd Packets         0.151001
Subflow Fwd Bytes           0.200562
Subflow Bwd Packets         0.124126
Subflow Bwd Bytes           0.053893
Init Fwd Win Bytes          0.032016
Init Bwd Win Bytes         -0.034003
Fwd Act Data Packets        0.049872
Fwd Seg Size Min            1.000000
attack_id                   0.109920
Name: Fwd Seg Size Min, dtype: float64
Row 46: Flow Duration               0.026159
Total Fwd Packets          -0.039465
Total Backward Packets     -0.064490
Fwd Packets Length Total   -0.085335
Bwd Packets Length Total   -0.035668
Fwd Packet Length Max      -0.114476
Fwd Packet Length Mean     -0.196100
Fwd Packet Length Std      -0.141036
Bwd Packet Length Max      -0.064293
Bwd Packet Length Mean     -0.085765
Bwd Packet Length Std      -0.039774
Flow Bytes/s               -0.014417
Flow Packets/s              0.040890
Flow IAT Mean               0.016617
Flow IAT Std                0.012121
Flow IAT Max               -0.049590
Flow IAT Min               -0.098165
Fwd IAT Total              -0.017367
Fwd IAT Mean               -0.019111
Fwd IAT Std                -0.121112
Fwd IAT Max                -0.085004
Fwd IAT Min                 0.108564
Bwd IAT Total               0.046394
Bwd IAT Mean               -0.032041
Bwd IAT Std                 0.013587
Bwd IAT Max                 0.098360
Bwd IAT Min                -0.056871
Fwd Header Length          -0.021366
Bwd Header Length          -0.029029
Fwd Packets/s               0.037160
Bwd Packets/s              -0.080766
Packet Length Max          -0.061398
Packet Length Mean         -0.098992
Packet Length Std          -0.097046
Packet Length Variance     -0.023701
Avg Packet Size            -0.103647
Avg Fwd Segment Size       -0.196100
Avg Bwd Segment Size       -0.085765
Subflow Fwd Packets        -0.039465
Subflow Fwd Bytes          -0.085335
Subflow Bwd Packets        -0.064490
Subflow Bwd Bytes          -0.035668
Init Fwd Win Bytes         -0.024570
Init Bwd Win Bytes         -0.188538
Fwd Act Data Packets       -0.086071
Fwd Seg Size Min            0.109920
attack_id                   1.000000
Name: attack_id, dtype: float64
In [125]:
#Renaming columns to remove space and replace it with underscore (_)
sampled_cic_df.columns=[col.replace(' ','_') for col in sampled_cic_df.columns]
In [126]:
sampled_cic_df.columns
Out[126]:
Index(['Flow_Duration', 'Total_Fwd_Packets', 'Total_Backward_Packets',
       'Fwd_Packets_Length_Total', 'Bwd_Packets_Length_Total',
       'Fwd_Packet_Length_Max', 'Fwd_Packet_Length_Mean',
       'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max',
       'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Flow_Bytes/s',
       'Flow_Packets/s', 'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max',
       'Flow_IAT_Min', 'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std',
       'Fwd_IAT_Max', 'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean',
       'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_Header_Length',
       'Bwd_Header_Length', 'Fwd_Packets/s', 'Bwd_Packets/s',
       'Packet_Length_Max', 'Packet_Length_Mean', 'Packet_Length_Std',
       'Packet_Length_Variance', 'Avg_Packet_Size', 'Avg_Fwd_Segment_Size',
       'Avg_Bwd_Segment_Size', 'Subflow_Fwd_Packets', 'Subflow_Fwd_Bytes',
       'Subflow_Bwd_Packets', 'Subflow_Bwd_Bytes', 'Init_Fwd_Win_Bytes',
       'Init_Bwd_Win_Bytes', 'Fwd_Act_Data_Packets', 'Fwd_Seg_Size_Min',
       'ClassLabel', 'isMalicious', 'attack_id'],
      dtype='object')
In [127]:
cic_df.columns=[col.replace(' ','_') for col in cic_df.columns]
In [128]:
columns_list=sampled_cic_df.columns.tolist()
In [129]:
sampled_cic_df.head()
Out[129]:
Flow_Duration Total_Fwd_Packets Total_Backward_Packets Fwd_Packets_Length_Total Bwd_Packets_Length_Total Fwd_Packet_Length_Max Fwd_Packet_Length_Mean Fwd_Packet_Length_Std Bwd_Packet_Length_Max Bwd_Packet_Length_Mean ... Subflow_Fwd_Bytes Subflow_Bwd_Packets Subflow_Bwd_Bytes Init_Fwd_Win_Bytes Init_Bwd_Win_Bytes Fwd_Act_Data_Packets Fwd_Seg_Size_Min ClassLabel isMalicious attack_id
5968290 3813760.0 5.0 3.0 935.0 397.0 935.0 187.000 418.144714 397.0 132.333328 ... 935.0 3.0 397.0 219.0 211.0 1.0 32.0 Benign 0 0
8285216 396839.0 2.0 0.0 0.0 0.0 0.0 0.000 0.000000 0.0 0.000000 ... 0.0 0.0 0.0 63326.0 235.0 0.0 20.0 Benign 0 0
8349977 1914354.0 8.0 7.0 1144.0 1581.0 677.0 143.000 227.969925 1173.0 225.857147 ... 1144.0 7.0 1581.0 8192.0 62856.0 5.0 20.0 Benign 0 0
7180832 4002.0 6.0 0.0 2064.0 0.0 440.0 44.000 148.722565 0.0 0.000000 ... 2064.0 0.0 0.0 8192.0 235.0 5.0 8.0 DDoS 1 3
2324438 5368715.0 8.0 6.0 355.0 232.0 198.0 44.375 75.864426 1460.0 108.000000 ... 355.0 6.0 232.0 8192.0 123.0 3.0 20.0 Benign 0 0

5 rows × 49 columns

In [130]:
columns_equal_min_and_Q1=[]
columns_equal_Q1_and_Q3=[]
columns_equal_Q3_and_max=[]
for col in columns_list:
    if(col!="isMalicious" and col!="ClassLabel" and col!="attack_id"):
        min_value=sampled_cic_df[col].min()
        p25_value=sampled_cic_df[col].quantile(0.25)
        p75_value=sampled_cic_df[col].quantile(0.75)
        max_value=sampled_cic_df[col].max()
        if(min_value==p25_value):
            columns_equal_min_and_Q1.append(col)
        if(p25_value==p75_value):
            columns_equal_Q1_and_Q3.append(col)
        if(p75_value==max_value):
            columns_equal_Q3_and_max.append(col)
In [131]:
print("List of features having equal min and Q1 value:\n",columns_equal_min_and_Q1)
List of features having equal min and Q1 value:
 ['Bwd_Packets_Length_Total', 'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max', 'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Flow_IAT_Std', 'Fwd_IAT_Std', 'Bwd_IAT_Total', 'Bwd_IAT_Mean', 'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Avg_Bwd_Segment_Size', 'Subflow_Bwd_Bytes', 'Fwd_Act_Data_Packets']
In [132]:
print("List of features having equal Q1 and Q3 value:\n",columns_equal_Q1_and_Q3)
List of features having equal Q1 and Q3 value:
 ['Init_Fwd_Win_Bytes', 'Fwd_Seg_Size_Min']
In [133]:
print("List of features having equal Q3 and max value:\n",columns_equal_Q3_and_max)
List of features having equal Q3 and max value:
 []

From the above results we observed: -

  1. Following features in the sampled dataset have equal minimum and Q1 value: -
    • Bwd_Packets_Length_Total
    • Fwd_Packet_Length_Std
    • Bwd_Packet_Length_Max
    • Bwd_Packet_Length_Mean
    • Bwd_Packet_Length_Std
    • Flow_IAT_Std
    • Fwd_IAT_Std
    • Bwd_IAT_Total
    • Bwd_IAT_Mean
    • Bwd_IAT_Std
    • Bwd_IAT_Max
    • Bwd_IAT_Min
    • Avg_Bwd_Segment_Size
    • Subflow_Bwd_Bytes
    • Fwd_Act_Data_Packets
  • For the above features, a large number of records are clustered in lower range. Thus, these features may have many zero values, or many constant values in lower range of data-points.
  • Since the features are concenterated on lower range of data, they are positively skewed.
  • If the features have significant number of records with same value, we will need to analyze if they can help to differentiate between Malicious or Benign records.
  1. Following features in the sampled dataset have equal Q1 and Q3 value: -
    • Init_Fwd_Win_Bytes
    • Fwd_Seg_Size_Min
  • For the above features, a large number of records are clustered at a single value. Thus, these features may have very low variability and may have many constant values.
  • If the features have significant number of records with same value, we will need to analyze if they can help to differentiate between Malicious or Benign records.
  1. There are no features in the sampled dataset having equal Q3 and maximum value.
  • Thus, there are no features having same value for 75th percentile and maximum value.
  • As the result, all features in upper range have high variability and are spread out (not concenterated around a single value or a group of values).
  • Thus, we do not have any features which are negatively skewed.
In [134]:
#Fetching the Q1 value in list of features having equal min and Q1 value
for col in columns_equal_min_and_Q1:
    print("Feature name: ",col," , Q1 value: ",sampled_cic_df[col].quantile(0.25))
Feature name:  Bwd_Packets_Length_Total  , Q1 value:  0.0
Feature name:  Fwd_Packet_Length_Std  , Q1 value:  0.0
Feature name:  Bwd_Packet_Length_Max  , Q1 value:  0.0
Feature name:  Bwd_Packet_Length_Mean  , Q1 value:  0.0
Feature name:  Bwd_Packet_Length_Std  , Q1 value:  0.0
Feature name:  Flow_IAT_Std  , Q1 value:  0.0
Feature name:  Fwd_IAT_Std  , Q1 value:  0.0
Feature name:  Bwd_IAT_Total  , Q1 value:  0.0
Feature name:  Bwd_IAT_Mean  , Q1 value:  0.0
Feature name:  Bwd_IAT_Std  , Q1 value:  0.0
Feature name:  Bwd_IAT_Max  , Q1 value:  0.0
Feature name:  Bwd_IAT_Min  , Q1 value:  0.0
Feature name:  Avg_Bwd_Segment_Size  , Q1 value:  0.0
Feature name:  Subflow_Bwd_Bytes  , Q1 value:  0.0
Feature name:  Fwd_Act_Data_Packets  , Q1 value:  0.0

From the above results, we observed 25% of the values among list of features having equal min and Q1 value are equal to 0.

In [135]:
#Fetching the Q1 value and Q3 value in list of features having equal Q1 and Q3 value
for col in columns_equal_Q1_and_Q3:
    print("Feature name: ",col," , Q1 value: ",sampled_cic_df[col].quantile(0.25), " , Q3 value: ",sampled_cic_df[col].quantile(0.75))
Feature name:  Init_Fwd_Win_Bytes  , Q1 value:  8192.0  , Q3 value:  8192.0
Feature name:  Fwd_Seg_Size_Min  , Q1 value:  20.0  , Q3 value:  20.0
In [136]:
sampled_cic_df['Init_Fwd_Win_Bytes'].median()
Out[136]:
8192.0
In [137]:
sampled_cic_df['Fwd_Seg_Size_Min'].median()
Out[137]:
20.0

From the above results we observed 50% of the values in the below two features have constant value: -

  1. Init_Fwd_Win_Bytes : 8192.0
  2. Fwd_Seg_Size_Min : 20.0
In [138]:
#Plotting charts to see if zero values have any differentiation among the two classes
for col in columns_equal_min_and_Q1:
    zero_class="_zero"
    col_name=col+zero_class
    sampled_cic_df[col_name]=(sampled_cic_df[col] == 0).astype(int)
    grouped_data = sampled_cic_df.groupby([col_name, 'isMalicious']).size().unstack()
    ax = grouped_data.plot(kind='bar', stacked=True) 
    x_label_value=col+ "(0: Non-zero, 1: Zero)"
    plt.xlabel(x_label_value) 
    plt.ylabel('Count') 
    title_value="Comparison of isMalicious for Zero and Non-Zero "+col
    plt.title(title_value,pad=20) 
    labels = ['No', 'Yes'] 
    handles, _ = ax.get_legend_handles_labels() 
    ax.legend(handles, labels, title='isMalicious')
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [139]:
#Plotting charts to see if equal Q1, median and Q3 values have any differentiation among the two classes
for col in columns_equal_Q1_and_Q3:
    mid_range_class="_mid_range"
    col_name=col+mid_range_class
    mid_value=sampled_cic_df[col].median()
    sampled_cic_df[col_name]=(sampled_cic_df[col] == mid_value).astype(int)
    grouped_data = sampled_cic_df.groupby([col_name, 'isMalicious']).size().unstack()
    ax = grouped_data.plot(kind='bar', stacked=True) 
    x_label_value=col+ "(0: Not mid-range, 1: Mid-range)"
    plt.xlabel(x_label_value) 
    plt.ylabel('Count') 
    title_value="Comparison of isMalicious for Mid-range and Non-mid-range "+col
    plt.title(title_value,pad=20) 
    labels = ['No', 'Yes'] 
    handles, _ = ax.get_legend_handles_labels() 
    ax.legend(handles, labels, title='isMalicious')
    plt.show()
No description has been provided for this image
No description has been provided for this image

Thus, we observed that creating new categories for two sets of features does not help us get any useful information to differentiate Malicious events from Benign events.

In [140]:
sampled_cic_df.ClassLabel.value_counts()
Out[140]:
ClassLabel
Benign          1437467
DDoS             246982
DoS               79186
Botnet            29348
Bruteforce        20546
Infiltration      18870
Webattack           625
Portscan            430
Name: count, dtype: int64
In [141]:
labels_and_counts={
    'Benign': 1437467,
    'DDoS': 246982,
    'DoS': 79186,
    'Botnet': 29348,
    'Bruteforce': 20546,
    'Infiltration': 18870,
    'Webattack': 625,
    'Portscan': 430
}
In [142]:
#Checking if rows with each label are unique in Sampled dataset
for label, count in labels_and_counts.items():
    rows_with_labels=sampled_cic_df[sampled_cic_df['ClassLabel']==label]
    if not rows_with_labels.duplicated().any():
        print(f"All {count} rows with label '{label}' are unique.")
    else:
        print(f"There are duplicates in {count} rows with label '{label}'.")
There are duplicates in 1437467 rows with label 'Benign'.
There are duplicates in 246982 rows with label 'DDoS'.
There are duplicates in 79186 rows with label 'DoS'.
All 29348 rows with label 'Botnet' are unique.
All 20546 rows with label 'Bruteforce' are unique.
There are duplicates in 18870 rows with label 'Infiltration'.
There are duplicates in 625 rows with label 'Webattack'.
All 430 rows with label 'Portscan' are unique.
In [143]:
sampled_cic_df.head()
Out[143]:
Flow_Duration Total_Fwd_Packets Total_Backward_Packets Fwd_Packets_Length_Total Bwd_Packets_Length_Total Fwd_Packet_Length_Max Fwd_Packet_Length_Mean Fwd_Packet_Length_Std Bwd_Packet_Length_Max Bwd_Packet_Length_Mean ... Bwd_IAT_Total_zero Bwd_IAT_Mean_zero Bwd_IAT_Std_zero Bwd_IAT_Max_zero Bwd_IAT_Min_zero Avg_Bwd_Segment_Size_zero Subflow_Bwd_Bytes_zero Fwd_Act_Data_Packets_zero Init_Fwd_Win_Bytes_mid_range Fwd_Seg_Size_Min_mid_range
5968290 3813760.0 5.0 3.0 935.0 397.0 935.0 187.000 418.144714 397.0 132.333328 ... 0 0 0 0 0 0 0 0 0 0
8285216 396839.0 2.0 0.0 0.0 0.0 0.0 0.000 0.000000 0.0 0.000000 ... 1 1 1 1 1 1 1 1 0 1
8349977 1914354.0 8.0 7.0 1144.0 1581.0 677.0 143.000 227.969925 1173.0 225.857147 ... 0 0 0 0 0 0 0 0 1 1
7180832 4002.0 6.0 0.0 2064.0 0.0 440.0 44.000 148.722565 0.0 0.000000 ... 1 1 1 1 1 1 1 0 1 0
2324438 5368715.0 8.0 6.0 355.0 232.0 198.0 44.375 75.864426 1460.0 108.000000 ... 0 0 0 0 1 0 0 0 1 1

5 rows × 66 columns

In [144]:
suffix="_zero"
sampled_cic_df=sampled_cic_df.drop(columns=sampled_cic_df.filter(like=suffix).columns)
In [145]:
sampled_cic_df.head()
Out[145]:
Flow_Duration Total_Fwd_Packets Total_Backward_Packets Fwd_Packets_Length_Total Bwd_Packets_Length_Total Fwd_Packet_Length_Max Fwd_Packet_Length_Mean Fwd_Packet_Length_Std Bwd_Packet_Length_Max Bwd_Packet_Length_Mean ... Subflow_Bwd_Bytes Init_Fwd_Win_Bytes Init_Bwd_Win_Bytes Fwd_Act_Data_Packets Fwd_Seg_Size_Min ClassLabel isMalicious attack_id Init_Fwd_Win_Bytes_mid_range Fwd_Seg_Size_Min_mid_range
5968290 3813760.0 5.0 3.0 935.0 397.0 935.0 187.000 418.144714 397.0 132.333328 ... 397.0 219.0 211.0 1.0 32.0 Benign 0 0 0 0
8285216 396839.0 2.0 0.0 0.0 0.0 0.0 0.000 0.000000 0.0 0.000000 ... 0.0 63326.0 235.0 0.0 20.0 Benign 0 0 0 1
8349977 1914354.0 8.0 7.0 1144.0 1581.0 677.0 143.000 227.969925 1173.0 225.857147 ... 1581.0 8192.0 62856.0 5.0 20.0 Benign 0 0 1 1
7180832 4002.0 6.0 0.0 2064.0 0.0 440.0 44.000 148.722565 0.0 0.000000 ... 0.0 8192.0 235.0 5.0 8.0 DDoS 1 3 1 0
2324438 5368715.0 8.0 6.0 355.0 232.0 198.0 44.375 75.864426 1460.0 108.000000 ... 232.0 8192.0 123.0 3.0 20.0 Benign 0 0 1 1

5 rows × 51 columns

In [146]:
suffix="_mid_range"
sampled_cic_df=sampled_cic_df.drop(columns=sampled_cic_df.filter(like=suffix).columns)
In [147]:
sampled_cic_df.head()
Out[147]:
Flow_Duration Total_Fwd_Packets Total_Backward_Packets Fwd_Packets_Length_Total Bwd_Packets_Length_Total Fwd_Packet_Length_Max Fwd_Packet_Length_Mean Fwd_Packet_Length_Std Bwd_Packet_Length_Max Bwd_Packet_Length_Mean ... Subflow_Fwd_Bytes Subflow_Bwd_Packets Subflow_Bwd_Bytes Init_Fwd_Win_Bytes Init_Bwd_Win_Bytes Fwd_Act_Data_Packets Fwd_Seg_Size_Min ClassLabel isMalicious attack_id
5968290 3813760.0 5.0 3.0 935.0 397.0 935.0 187.000 418.144714 397.0 132.333328 ... 935.0 3.0 397.0 219.0 211.0 1.0 32.0 Benign 0 0
8285216 396839.0 2.0 0.0 0.0 0.0 0.0 0.000 0.000000 0.0 0.000000 ... 0.0 0.0 0.0 63326.0 235.0 0.0 20.0 Benign 0 0
8349977 1914354.0 8.0 7.0 1144.0 1581.0 677.0 143.000 227.969925 1173.0 225.857147 ... 1144.0 7.0 1581.0 8192.0 62856.0 5.0 20.0 Benign 0 0
7180832 4002.0 6.0 0.0 2064.0 0.0 440.0 44.000 148.722565 0.0 0.000000 ... 2064.0 0.0 0.0 8192.0 235.0 5.0 8.0 DDoS 1 3
2324438 5368715.0 8.0 6.0 355.0 232.0 198.0 44.375 75.864426 1460.0 108.000000 ... 355.0 6.0 232.0 8192.0 123.0 3.0 20.0 Benign 0 0

5 rows × 49 columns

In [148]:
sampled_cic_df.columns
Out[148]:
Index(['Flow_Duration', 'Total_Fwd_Packets', 'Total_Backward_Packets',
       'Fwd_Packets_Length_Total', 'Bwd_Packets_Length_Total',
       'Fwd_Packet_Length_Max', 'Fwd_Packet_Length_Mean',
       'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max',
       'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Flow_Bytes/s',
       'Flow_Packets/s', 'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max',
       'Flow_IAT_Min', 'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std',
       'Fwd_IAT_Max', 'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean',
       'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_Header_Length',
       'Bwd_Header_Length', 'Fwd_Packets/s', 'Bwd_Packets/s',
       'Packet_Length_Max', 'Packet_Length_Mean', 'Packet_Length_Std',
       'Packet_Length_Variance', 'Avg_Packet_Size', 'Avg_Fwd_Segment_Size',
       'Avg_Bwd_Segment_Size', 'Subflow_Fwd_Packets', 'Subflow_Fwd_Bytes',
       'Subflow_Bwd_Packets', 'Subflow_Bwd_Bytes', 'Init_Fwd_Win_Bytes',
       'Init_Bwd_Win_Bytes', 'Fwd_Act_Data_Packets', 'Fwd_Seg_Size_Min',
       'ClassLabel', 'isMalicious', 'attack_id'],
      dtype='object')
In [149]:
sampled_cic_df.shape
Out[149]:
(1833454, 49)
In [150]:
sampled_cic_df.ClassLabel.value_counts()
Out[150]:
ClassLabel
Benign          1437467
DDoS             246982
DoS               79186
Botnet            29348
Bruteforce        20546
Infiltration      18870
Webattack           625
Portscan            430
Name: count, dtype: int64
In [151]:
sampled_cic_df.drop_duplicates(subset=sampled_cic_df.columns[:-1], keep='first')
sampled_cic_df.shape
Out[151]:
(1833454, 49)
In [152]:
#List of labels to keep
labels_to_keep=['Benign','DDoS','Botnet','Bruteforce']

#Filtering the sampled dataset to only keep rows with the above set of labels
sampled_cic_df=sampled_cic_df[sampled_cic_df['ClassLabel'].isin(labels_to_keep)]
In [153]:
sampled_cic_df.shape
Out[153]:
(1734343, 49)
In [154]:
sampled_cic_df.head()
Out[154]:
Flow_Duration Total_Fwd_Packets Total_Backward_Packets Fwd_Packets_Length_Total Bwd_Packets_Length_Total Fwd_Packet_Length_Max Fwd_Packet_Length_Mean Fwd_Packet_Length_Std Bwd_Packet_Length_Max Bwd_Packet_Length_Mean ... Subflow_Fwd_Bytes Subflow_Bwd_Packets Subflow_Bwd_Bytes Init_Fwd_Win_Bytes Init_Bwd_Win_Bytes Fwd_Act_Data_Packets Fwd_Seg_Size_Min ClassLabel isMalicious attack_id
5968290 3813760.0 5.0 3.0 935.0 397.0 935.0 187.000 418.144714 397.0 132.333328 ... 935.0 3.0 397.0 219.0 211.0 1.0 32.0 Benign 0 0
8285216 396839.0 2.0 0.0 0.0 0.0 0.0 0.000 0.000000 0.0 0.000000 ... 0.0 0.0 0.0 63326.0 235.0 0.0 20.0 Benign 0 0
8349977 1914354.0 8.0 7.0 1144.0 1581.0 677.0 143.000 227.969925 1173.0 225.857147 ... 1144.0 7.0 1581.0 8192.0 62856.0 5.0 20.0 Benign 0 0
7180832 4002.0 6.0 0.0 2064.0 0.0 440.0 44.000 148.722565 0.0 0.000000 ... 2064.0 0.0 0.0 8192.0 235.0 5.0 8.0 DDoS 1 3
2324438 5368715.0 8.0 6.0 355.0 232.0 198.0 44.375 75.864426 1460.0 108.000000 ... 355.0 6.0 232.0 8192.0 123.0 3.0 20.0 Benign 0 0

5 rows × 49 columns

In [155]:
sampled_cic_df.ClassLabel.value_counts()
Out[155]:
ClassLabel
Benign        1437467
DDoS           246982
Botnet          29348
Bruteforce      20546
Name: count, dtype: int64
  • Assuming that the sampled dataset's distribution of Class Labels is same as main dataset's distribution of Class Labels.
  • We shall filter the data and keep the rows with only with 4 types of attacks: 'Benign','DDoS','Botnet','Bruteforce' and Benign records.
  • However, due to large file size of main dataset, we get out of memory error when we try to drop the rows.
  • Thus, we shall perform our modelling on sampled dataset.
In [156]:
sampled_cic_df.columns
Out[156]:
Index(['Flow_Duration', 'Total_Fwd_Packets', 'Total_Backward_Packets',
       'Fwd_Packets_Length_Total', 'Bwd_Packets_Length_Total',
       'Fwd_Packet_Length_Max', 'Fwd_Packet_Length_Mean',
       'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max',
       'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Flow_Bytes/s',
       'Flow_Packets/s', 'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max',
       'Flow_IAT_Min', 'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std',
       'Fwd_IAT_Max', 'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean',
       'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_Header_Length',
       'Bwd_Header_Length', 'Fwd_Packets/s', 'Bwd_Packets/s',
       'Packet_Length_Max', 'Packet_Length_Mean', 'Packet_Length_Std',
       'Packet_Length_Variance', 'Avg_Packet_Size', 'Avg_Fwd_Segment_Size',
       'Avg_Bwd_Segment_Size', 'Subflow_Fwd_Packets', 'Subflow_Fwd_Bytes',
       'Subflow_Bwd_Packets', 'Subflow_Bwd_Bytes', 'Init_Fwd_Win_Bytes',
       'Init_Bwd_Win_Bytes', 'Fwd_Act_Data_Packets', 'Fwd_Seg_Size_Min',
       'ClassLabel', 'isMalicious', 'attack_id'],
      dtype='object')
  • Since we had perfomed Label encoding on ClassLabel and stored the results earlier, the encoded values for type of attack will have gap after we dropped the records.
  • Thus, we will drop the column: attack_id, and redo label encdoing on the sampled dataset.
In [157]:
sampled_cic_df=sampled_cic_df.drop('attack_id',axis=1)
In [158]:
sampled_cic_df.columns
Out[158]:
Index(['Flow_Duration', 'Total_Fwd_Packets', 'Total_Backward_Packets',
       'Fwd_Packets_Length_Total', 'Bwd_Packets_Length_Total',
       'Fwd_Packet_Length_Max', 'Fwd_Packet_Length_Mean',
       'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max',
       'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Flow_Bytes/s',
       'Flow_Packets/s', 'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max',
       'Flow_IAT_Min', 'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std',
       'Fwd_IAT_Max', 'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean',
       'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_Header_Length',
       'Bwd_Header_Length', 'Fwd_Packets/s', 'Bwd_Packets/s',
       'Packet_Length_Max', 'Packet_Length_Mean', 'Packet_Length_Std',
       'Packet_Length_Variance', 'Avg_Packet_Size', 'Avg_Fwd_Segment_Size',
       'Avg_Bwd_Segment_Size', 'Subflow_Fwd_Packets', 'Subflow_Fwd_Bytes',
       'Subflow_Bwd_Packets', 'Subflow_Bwd_Bytes', 'Init_Fwd_Win_Bytes',
       'Init_Bwd_Win_Bytes', 'Fwd_Act_Data_Packets', 'Fwd_Seg_Size_Min',
       'ClassLabel', 'isMalicious'],
      dtype='object')
In [159]:
sampled_cic_df["attack_id"]=le.fit_transform(sampled_cic_df["ClassLabel"])
label_mapping = dict(zip(le.classes_, le.transform(le.classes_))) 
print("Attack id of each distinct value in field ClassLabel:", label_mapping)
Attack id of each distinct value in field ClassLabel: {'Benign': 0, 'Botnet': 1, 'Bruteforce': 2, 'DDoS': 3}
In [160]:
sampled_cic_df.shape
Out[160]:
(1734343, 49)
In [161]:
sampled_cic_df.head()
Out[161]:
Flow_Duration Total_Fwd_Packets Total_Backward_Packets Fwd_Packets_Length_Total Bwd_Packets_Length_Total Fwd_Packet_Length_Max Fwd_Packet_Length_Mean Fwd_Packet_Length_Std Bwd_Packet_Length_Max Bwd_Packet_Length_Mean ... Subflow_Fwd_Bytes Subflow_Bwd_Packets Subflow_Bwd_Bytes Init_Fwd_Win_Bytes Init_Bwd_Win_Bytes Fwd_Act_Data_Packets Fwd_Seg_Size_Min ClassLabel isMalicious attack_id
5968290 3813760.0 5.0 3.0 935.0 397.0 935.0 187.000 418.144714 397.0 132.333328 ... 935.0 3.0 397.0 219.0 211.0 1.0 32.0 Benign 0 0
8285216 396839.0 2.0 0.0 0.0 0.0 0.0 0.000 0.000000 0.0 0.000000 ... 0.0 0.0 0.0 63326.0 235.0 0.0 20.0 Benign 0 0
8349977 1914354.0 8.0 7.0 1144.0 1581.0 677.0 143.000 227.969925 1173.0 225.857147 ... 1144.0 7.0 1581.0 8192.0 62856.0 5.0 20.0 Benign 0 0
7180832 4002.0 6.0 0.0 2064.0 0.0 440.0 44.000 148.722565 0.0 0.000000 ... 2064.0 0.0 0.0 8192.0 235.0 5.0 8.0 DDoS 1 3
2324438 5368715.0 8.0 6.0 355.0 232.0 198.0 44.375 75.864426 1460.0 108.000000 ... 355.0 6.0 232.0 8192.0 123.0 3.0 20.0 Benign 0 0

5 rows × 49 columns

In [162]:
sampled_cic_df.tail()
Out[162]:
Flow_Duration Total_Fwd_Packets Total_Backward_Packets Fwd_Packets_Length_Total Bwd_Packets_Length_Total Fwd_Packet_Length_Max Fwd_Packet_Length_Mean Fwd_Packet_Length_Std Bwd_Packet_Length_Max Bwd_Packet_Length_Mean ... Subflow_Fwd_Bytes Subflow_Bwd_Packets Subflow_Bwd_Bytes Init_Fwd_Win_Bytes Init_Bwd_Win_Bytes Fwd_Act_Data_Packets Fwd_Seg_Size_Min ClassLabel isMalicious attack_id
1606912 189583.0 10.0 8.0 496.0 232.0 192.0 49.599998 77.654793 1460.0 108.0 ... 496.0 8.0 232.0 8192.0 31.0 4.0 20.0 Benign 0 0
7433839 3000787.0 4.0 0.0 2064.0 0.0 516.0 44.000000 0.000000 0.0 0.0 ... 2064.0 0.0 0.0 8192.0 235.0 3.0 20.0 DDoS 1 3
2510144 40.0 1.0 1.0 0.0 0.0 0.0 0.000000 0.000000 0.0 0.0 ... 0.0 1.0 0.0 8192.0 16625.0 0.0 8.0 Benign 0 0
760618 396839.0 2.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.0 0.0 ... 0.0 0.0 0.0 279.0 235.0 0.0 20.0 Benign 0 0
7134908 4097.0 3.0 0.0 97.0 0.0 440.0 44.000000 27.430199 0.0 0.0 ... 97.0 0.0 0.0 8192.0 235.0 1.0 20.0 DDoS 1 3

5 rows × 49 columns

In [163]:
#We will drop the column: ClassLabel. Thus, we will have two target features: isMalicious, attack_id
sampled_cic_df=sampled_cic_df.drop('ClassLabel',axis=1)
In [164]:
#We will drop the column: Init Bwd Win Bytes. This is based on obbservations from Pyramid chart.
sampled_cic_df=sampled_cic_df.drop('Init_Bwd_Win_Bytes',axis=1)
In [165]:
sampled_cic_df.shape
Out[165]:
(1734343, 47)
In [166]:
sampled_cic_df.head()
Out[166]:
Flow_Duration Total_Fwd_Packets Total_Backward_Packets Fwd_Packets_Length_Total Bwd_Packets_Length_Total Fwd_Packet_Length_Max Fwd_Packet_Length_Mean Fwd_Packet_Length_Std Bwd_Packet_Length_Max Bwd_Packet_Length_Mean ... Avg_Bwd_Segment_Size Subflow_Fwd_Packets Subflow_Fwd_Bytes Subflow_Bwd_Packets Subflow_Bwd_Bytes Init_Fwd_Win_Bytes Fwd_Act_Data_Packets Fwd_Seg_Size_Min isMalicious attack_id
5968290 3813760.0 5.0 3.0 935.0 397.0 935.0 187.000 418.144714 397.0 132.333328 ... 132.333328 5.0 935.0 3.0 397.0 219.0 1.0 32.0 0 0
8285216 396839.0 2.0 0.0 0.0 0.0 0.0 0.000 0.000000 0.0 0.000000 ... 0.000000 2.0 0.0 0.0 0.0 63326.0 0.0 20.0 0 0
8349977 1914354.0 8.0 7.0 1144.0 1581.0 677.0 143.000 227.969925 1173.0 225.857147 ... 225.857147 8.0 1144.0 7.0 1581.0 8192.0 5.0 20.0 0 0
7180832 4002.0 6.0 0.0 2064.0 0.0 440.0 44.000 148.722565 0.0 0.000000 ... 0.000000 6.0 2064.0 0.0 0.0 8192.0 5.0 8.0 1 3
2324438 5368715.0 8.0 6.0 355.0 232.0 198.0 44.375 75.864426 1460.0 108.000000 ... 108.000000 8.0 355.0 6.0 232.0 8192.0 3.0 20.0 0 0

5 rows × 47 columns

In [167]:
sampled_cic_df.tail()
Out[167]:
Flow_Duration Total_Fwd_Packets Total_Backward_Packets Fwd_Packets_Length_Total Bwd_Packets_Length_Total Fwd_Packet_Length_Max Fwd_Packet_Length_Mean Fwd_Packet_Length_Std Bwd_Packet_Length_Max Bwd_Packet_Length_Mean ... Avg_Bwd_Segment_Size Subflow_Fwd_Packets Subflow_Fwd_Bytes Subflow_Bwd_Packets Subflow_Bwd_Bytes Init_Fwd_Win_Bytes Fwd_Act_Data_Packets Fwd_Seg_Size_Min isMalicious attack_id
1606912 189583.0 10.0 8.0 496.0 232.0 192.0 49.599998 77.654793 1460.0 108.0 ... 108.0 10.0 496.0 8.0 232.0 8192.0 4.0 20.0 0 0
7433839 3000787.0 4.0 0.0 2064.0 0.0 516.0 44.000000 0.000000 0.0 0.0 ... 0.0 4.0 2064.0 0.0 0.0 8192.0 3.0 20.0 1 3
2510144 40.0 1.0 1.0 0.0 0.0 0.0 0.000000 0.000000 0.0 0.0 ... 0.0 1.0 0.0 1.0 0.0 8192.0 0.0 8.0 0 0
760618 396839.0 2.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.0 0.0 ... 0.0 2.0 0.0 0.0 0.0 279.0 0.0 20.0 0 0
7134908 4097.0 3.0 0.0 97.0 0.0 440.0 44.000000 27.430199 0.0 0.0 ... 0.0 3.0 97.0 0.0 0.0 8192.0 1.0 20.0 1 3

5 rows × 47 columns

In [168]:
#We will store the data in sampled_cic_df in a new .parquet file which can later be used for feature selection and training the models.
sampled_cic_df.to_parquet('processed_dataset.parquet')
In [ ]: